# from tvm.script import ir as I
# from tvm.script import tir as T
# from tvm.script import relax as R

# NOTE(review): this is machine-generated TVMScript (scheduled TIR) emitted by the
# TVM/MLC compiler; the original dump had its newlines stripped and is reformatted
# here without changing any code token. Do not hand-tune the schedules below —
# regenerate them from the compiler pipeline instead. `metadata[...]` refers to the
# serialized runtime.Module metadata that accompanies this dump.
@I.ir_module
class Module:
    I.module_attrs({"external_mods": [metadata["runtime.Module"][0], metadata["runtime.Module"][1], metadata["runtime.Module"][2], metadata["runtime.Module"][3], metadata["runtime.Module"][4], metadata["runtime.Module"][5], metadata["runtime.Module"][6], metadata["runtime.Module"][7], metadata["runtime.Module"][8], metadata["runtime.Module"][9], metadata["runtime.Module"][10], metadata["runtime.Module"][11], metadata["runtime.Module"][12], metadata["runtime.Module"][13], metadata["runtime.Module"][14]]})

    # GEMV-style NT matmul: out(1,1,1280) = layer_norm356(1,1,1280) @ W(1280,1280)^T, float16.
    # Reduction over the 1280-wide axis is factored ("rf") into per-thread partials,
    # then folded across vector lanes and finally across threadIdx.x (32 threads).
    @T.prim_func
    def NT_matmul(layer_norm356: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_q_proj_weight5: T.Buffer((T.int64(1280), T.int64(1280)), "float16"), NT_matmul: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        # with T.block("root"):
        NT_matmul_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local")
        NT_matmul_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local")
        model_decoder_layers_0_self_attn_q_proj_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(1280)), "float16", scope="local")
        layer_norm356_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared")
        for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), thread="blockIdx.x"):
            for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"):
                for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"):
                    # Stage 0: cooperatively stage the 1280-element activation row into shared memory.
                    for ax0, ax1 in T.grid(T.int64(1), T.int64(1)):
                        for ax2_0 in T.serial(T.int64(3), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}):
                            for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"):
                                for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"):
                                    for ax2_3 in T.vectorized(T.int64(1)):
                                        with T.block("layer_norm356_shared"):
                                            v0, v1 = T.axis.remap("SS", [ax0, ax1])
                                            v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(512) + ax2_1 * T.int64(32) + ax2_2 + ax2_3)
                                            # guard: 3 * 512 = 1536 slots cover only 1280 elements
                                            T.where((ax2_0 * T.int64(16) + ax2_1) * T.int64(32) + ax2_2 + ax2_3 < T.int64(1280))
                                            T.reads(layer_norm356[v0, v1, v2])
                                            T.writes(layer_norm356_shared[v0, v1, v2])
                                            layer_norm356_shared[v0, v1, v2] = layer_norm356[v0, v1, v2]
                    # Stage 1a: zero this thread's 4 partial accumulators.
                    for u_fused_ax0_fused_fused_2_init in range(T.int64(1)):
                        for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)):
                            with T.block("NT_matmul_rf_init"):
                                vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init)
                                v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init)
                                T.reads()
                                T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0])
                                NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0)
                    # Stage 1b: accumulate partial dot products over 5 chunks of 256 reduction elements.
                    for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
                        # Cache this thread's 8 weight values for the chunk in registers.
                        for ax0_ax1_fused_0 in range(T.int64(4)):
                            for ax0_ax1_fused_1 in T.vectorized(T.int64(2)):
                                with T.block("model_decoder_layers_0_self_attn_q_proj_weight5_local"):
                                    v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1)
                                    v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1)
                                    T.reads(model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1])
                                    T.writes(model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1])
                                    model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1] = model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1]
                        for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)):
                            for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)):
                                with T.block("NT_matmul_rf_update"):
                                    vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1)
                                    v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2)
                                    vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2])
                                    T.reads(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)])
                                    T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0])
                                    NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]
            # Stage 2: fold each thread's 4 vector-lane partials into one value per thread.
            for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"):
                for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"):
                    for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
                        for ax2_fused_2_1 in T.vectorized(T.int64(1)):
                            with T.block("NT_matmul_rf_init"):
                                vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0)
                                v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1)
                                T.reads()
                                T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                                NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0)
                            for ax1 in range(T.int64(4)):
                                with T.block("NT_matmul_rf_update"):
                                    vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1])
                                    v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1)
                                    T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0])
                                    T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                                    NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]
            # Stage 3: cross-thread (threadIdx.x, 32-wide) reduction into the output buffer.
            for ax1_fused_2 in range(T.int64(1)):
                for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"):
                    for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"):
                        with T.block("NT_matmul"):
                            vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0)
                            v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2)
                            T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                            T.writes(NT_matmul[T.int64(0), T.int64(0), v0])
                            with T.init():
                                NT_matmul[T.int64(0), T.int64(0), v0] = T.float16(0)
                            NT_matmul[T.int64(0), T.int64(0), v0] = NT_matmul[T.int64(0), T.int64(0), v0] + NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]

    # Logits GEMV: out(1,1,51866) = layer_norm452(1,1,1280) @ embed_tokens(51866,1280)^T.
    # Same rf reduction scheme as NT_matmul but accumulating in float32 and with
    # T.where guards because 51866 is not a multiple of the 4-row tile.
    @T.prim_func
    def NT_matmul3(layer_norm452: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_embed_tokens_weight5: T.Buffer((T.int64(51866), T.int64(1280)), "float16"), NT_matmul: T.Buffer((T.int64(1), T.int64(1), T.int64(51866)), "float32")):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        # with T.block("root"):
        NT_matmul_rf_local = T.alloc_buffer((T.int64(256), T.int64(1), T.int64(1), T.int64(51866)), scope="local")
        NT_matmul_rf_local_1 = T.alloc_buffer((T.int64(64), T.int64(1), T.int64(1), T.int64(51866)), scope="local")
        model_decoder_embed_tokens_weight5_local = T.alloc_buffer((T.int64(51866), T.int64(1280)), "float16", scope="local")
        layer_norm452_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared")
        for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(12967), thread="blockIdx.x"):
            for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"):
                for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(64), thread="threadIdx.x"):
                    # Stage 0: stage the activation row into shared memory (5 * 256 = 1280 exactly, no guard).
                    for ax0, ax1 in T.grid(T.int64(1), T.int64(1)):
                        for ax2_0 in T.serial(T.int64(5), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}):
                            for ax2_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"):
                                for ax2_2 in T.thread_binding(T.int64(64), thread="threadIdx.x"):
                                    for ax2_3 in T.vectorized(T.int64(1)):
                                        with T.block("layer_norm452_shared"):
                                            v0, v1 = T.axis.remap("SS", [ax0, ax1])
                                            v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(256) + ax2_1 * T.int64(64) + ax2_2 + ax2_3)
                                            T.reads(layer_norm452[v0, v1, v2])
                                            T.writes(layer_norm452_shared[v0, v1, v2])
                                            layer_norm452_shared[v0, v1, v2] = layer_norm452[v0, v1, v2]
                    # Stage 1a: zero per-thread partials (guarded against the 51866 tail).
                    for u_fused_ax0_fused_fused_2_init in range(T.int64(1)):
                        for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)):
                            with T.block("NT_matmul_rf_init"):
                                vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init)
                                v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init)
                                T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init < T.int64(51866))
                                T.reads()
                                T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0])
                                NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float32(0)
                    # Stage 1b: accumulate over 5 chunks of 256 reduction elements, casting f16 -> f32.
                    for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
                        for ax0_ax1_fused_0 in range(T.int64(2)):
                            for ax0_ax1_fused_1 in T.vectorized(T.int64(2)):
                                with T.block("model_decoder_embed_tokens_weight5_local"):
                                    v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1)
                                    v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1)
                                    T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 < T.int64(51866))
                                    T.reads(model_decoder_embed_tokens_weight5[v0, v1])
                                    T.writes(model_decoder_embed_tokens_weight5_local[v0, v1])
                                    model_decoder_embed_tokens_weight5_local[v0, v1] = model_decoder_embed_tokens_weight5[v0, v1]
                        for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(1)):
                            for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)):
                                with T.block("NT_matmul_rf_update"):
                                    vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1)
                                    v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2)
                                    vax1_fused_u_fused_2, vax1_fused_u_fused_0 = T.axis.remap("RR", [ax1_fused_u_fused_2, ax1_fused_u_fused_0])
                                    T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2 < T.int64(51866))
                                    T.reads(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], layer_norm452_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused], model_decoder_embed_tokens_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused])
                                    T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0])
                                    NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + T.Cast("float32", layer_norm452_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused]) * T.Cast("float32", model_decoder_embed_tokens_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused])
            # Stage 2: fold the 4 vector lanes of each thread into one partial per thread.
            for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"):
                for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"):
                    for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
                        for ax2_fused_2_1 in T.vectorized(T.int64(1)):
                            with T.block("NT_matmul_rf_init"):
                                vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(64), ax0)
                                v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1)
                                T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + (T.Mul(T.int64(0), T.int64(4)) + ax2_fused_0_ax2_fused_1_fused % T.int64(4) + (ax2_fused_2_0 + ax2_fused_2_1)) < T.int64(51866))
                                T.reads()
                                T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                                NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float32(0)
                            for ax1 in range(T.int64(4)):
                                with T.block("NT_matmul_rf_update"):
                                    vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1])
                                    v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1)
                                    T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + (T.Mul(T.int64(0), T.int64(4)) + ax2_fused_0_ax2_fused_1_fused % T.int64(4) + (ax2_fused_2_0 + ax2_fused_2_1)) < T.int64(51866))
                                    T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0])
                                    T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                                    NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]
            # Stage 3: cross-thread (threadIdx.x, 64-wide) reduction into the float32 output.
            for ax1_fused_2 in range(T.int64(1)):
                for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"):
                    for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"):
                        with T.block("NT_matmul"):
                            vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(64), ax0)
                            v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2)
                            T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + (T.Mul(T.int64(0), T.int64(4)) + ax1_fused_0_ax1_fused_1_fused % T.int64(4) + ax1_fused_2) < T.int64(51866))
                            T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0])
                            T.writes(NT_matmul[T.int64(0), T.int64(0), v0])
                            with T.init():
                                NT_matmul[T.int64(0), T.int64(0), v0] = T.float32(0)
                            NT_matmul[T.int64(0), T.int64(0), v0] = NT_matmul[T.int64(0), T.int64(0), v0] + NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]

    # Elementwise add over (batch_size, 1, 1280) float16, flattened across a 1024-thread grid.
    @T.prim_func
    def add(var_reshape708: T.handle, var_reshape709: T.handle, var_T_add: T.handle):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        batch_size = T.int64()
        reshape708 = T.match_buffer(var_reshape708, (batch_size, T.int64(1), T.int64(1280)), "float16")
        reshape709 = T.match_buffer(var_reshape709, (batch_size, T.int64(1), T.int64(1280)), "float16")
        T_add = T.match_buffer(var_T_add, (batch_size, T.int64(1), T.int64(1280)), "float16")
        # with T.block("root"):
        for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"):
            for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"):
                with T.block("T_add"):
                    v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280))
                    v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280))
                    T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280))
                    T.reads(reshape708[v0, T.int64(0), v1], reshape709[v0, T.int64(0), v1])
                    T.writes(T_add[v0, T.int64(0), v1])
                    T_add[v0, T.int64(0), v1] = reshape708[v0, T.int64(0), v1] + reshape709[v0, T.int64(0), v1]

    # Elementwise add over (batch_size, 1500, 1280) float16.
    # 1500 * 1280 = 1920000 = 1875 * 1024, so the launch is exact and needs no T.where guard.
    @T.prim_func
    def add4(var_add: T.handle, var_lv610: T.handle, var_T_add: T.handle):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        batch_size = T.int64()
        add = T.match_buffer(var_add, (batch_size, T.int64(1500), T.int64(1280)), "float16")
        lv610 = T.match_buffer(var_lv610, (batch_size, T.int64(1500), T.int64(1280)), "float16")
        T_add = T.match_buffer(var_T_add, (batch_size, T.int64(1500), T.int64(1280)), "float16")
        # with T.block("root"):
        for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"):
            for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"):
                with T.block("T_add"):
                    v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1920000))
                    v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280))
                    v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280))
                    T.reads(add[v0, v1, v2], lv610[v0, v1, v2])
                    T.writes(T_add[v0, v1, v2])
                    T_add[v0, v1, v2] = add[v0, v1, v2] + lv610[v0, v1, v2]

    # Elementwise add over (1, seq_len, 1280) float16 — same scheme as `add` but with
    # the dynamic dimension in the middle axis.
    @T.prim_func
    def add5(var_reshape385: T.handle, var_reshape386: T.handle, var_T_add: T.handle):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        seq_len = T.int64()
        reshape385 = T.match_buffer(var_reshape385, (T.int64(1), seq_len, T.int64(1280)), "float16")
        reshape386 = T.match_buffer(var_reshape386, (T.int64(1), seq_len, T.int64(1280)), "float16")
        T_add = T.match_buffer(var_T_add, (T.int64(1), seq_len, T.int64(1280)), "float16")
        # with T.block("root"):
        for ax0_ax1_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"):
            for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"):
                with T.block("T_add"):
                    v0 = T.axis.spatial(seq_len, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280))
                    v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280))
                    T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < seq_len * T.int64(1280))
                    T.reads(reshape385[T.int64(0), v0, v1], reshape386[T.int64(0), v0, v1])
                    T.writes(T_add[T.int64(0), v0, v1])
                    T_add[T.int64(0), v0, v1] = reshape385[T.int64(0), v0, v1] + reshape386[T.int64(0), v0, v1]

    # In-place vocabulary masking: where bit vv of the per-sequence bitmask is 0, the
    # logit is replaced by float32 lowest (-3.4028234663852886e+38); rows are selected
    # indirectly through seq_ids.
    @T.prim_func
    def apply_bitmask_inplace(var_logits: T.handle, var_seq_ids: T.handle, var_bitmask: T.handle):
        T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})
        batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True)
        logits = T.match_buffer(var_logits, (batch_size, vocab_size))
        num_seq = T.int32(is_size_var=True)
        seq_ids = T.match_buffer(var_seq_ids, (num_seq,), "int32")
        bitmask = T.match_buffer(var_bitmask, (batch_size, (vocab_size + 31) // 32), "int32")
        # with T.block("root"):
        for fused_s_v_0 in T.thread_binding((num_seq * vocab_size + 1023) // 1024, thread="blockIdx.x"):
            for fused_s_v_1 in T.thread_binding(1024, thread="threadIdx.x"):
                with T.block("block"):
                    vs = T.axis.spatial(num_seq, (fused_s_v_0 * 1024 + fused_s_v_1) // vocab_size)
                    vv = T.axis.spatial(vocab_size, (fused_s_v_0 * 1024 + fused_s_v_1) % vocab_size)
                    T.where(fused_s_v_0 * 1024 + fused_s_v_1 < num_seq * vocab_size)
                    T.reads(bitmask[seq_ids[vs], vv // 32], seq_ids[vs], logits[seq_ids[vs], vv])
                    T.writes(logits[seq_ids[vs], vv])
                    logits[seq_ids[vs], vv] = T.if_then_else(T.bitwise_and(T.shift_right(bitmask[seq_ids[vs], vv // 32], vv % 32), 1) == 1, logits[seq_ids[vs], vv], T.float32(-3.4028234663852886e+38))

    # In-place additive logit bias: one (sequence, token) pair per flattened thread,
    # indexed indirectly through pos2seq_id / token_ids.
    @T.prim_func
    def apply_logit_bias_inplace(var_logits: T.handle, var_pos2seq_id: T.handle, var_token_ids: T.handle, var_logit_bias: T.handle):
        T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})
        batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True)
        logits = T.match_buffer(var_logits, (batch_size, vocab_size))
        num_token = T.int32(is_size_var=True)
        pos2seq_id = T.match_buffer(var_pos2seq_id, (num_token,), "int32")
        token_ids = T.match_buffer(var_token_ids, (num_token,), "int32")
        logit_bias = T.match_buffer(var_logit_bias, (num_token,))
        # with T.block("root"):
        for p0 in T.thread_binding((num_token + 1023) // 1024, thread="blockIdx.x"):
            for p1 in T.thread_binding(1024, thread="threadIdx.x"):
                with T.block("block"):
                    vp = T.axis.spatial(num_token, p0 * 1024 + p1)
                    T.where(p0 * 1024 + p1 < num_token)
                    T.reads(logits[pos2seq_id[vp], token_ids[vp]], pos2seq_id[vp], token_ids[vp], logit_bias[vp])
                    T.writes(logits[pos2seq_id[vp], token_ids[vp]])
                    logits[pos2seq_id[vp], token_ids[vp]] = logits[pos2seq_id[vp], token_ids[vp]] + logit_bias[vp]

    # In-place sampling penalties. penalties[s] holds 3 values used as:
    # [0] plus count-scaled [1] are subtracted, then [2] divides positive logits and
    # multiplies non-positive ones (the two-sided repetition-penalty form).
    @T.prim_func
    def apply_penalty_inplace(var_logits: T.handle, var_seq_ids: T.handle, var_pos2seq_id: T.handle, var_token_ids: T.handle, var_token_cnt: T.handle, var_penalties: T.handle):
        T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})
        batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True)
        logits = T.match_buffer(var_logits, (batch_size, vocab_size))
        num_seq = T.int32(is_size_var=True)
        seq_ids = T.match_buffer(var_seq_ids, (num_seq,), "int32")
        num_token = T.int32(is_size_var=True)
        pos2seq_id = T.match_buffer(var_pos2seq_id, (num_token,), "int32")
        token_ids = T.match_buffer(var_token_ids, (num_token,), "int32")
        token_cnt = T.match_buffer(var_token_cnt, (num_token,), "int32")
        penalties = T.match_buffer(var_penalties, (num_seq, 3))
        # with T.block("root"):
        for p0 in T.thread_binding((num_token + 1023) // 1024, thread="blockIdx.x"):
            for p1 in T.thread_binding(1024, thread="threadIdx.x"):
                with T.block("block"):
                    vp = T.axis.spatial(num_token, p0 * 1024 + p1)
                    T.where(p0 * 1024 + p1 < num_token)
                    T.reads(logits[seq_ids[pos2seq_id[vp]], token_ids[vp]], seq_ids[pos2seq_id[vp]], pos2seq_id[vp], token_ids[vp], penalties[pos2seq_id[vp], 0:3], token_cnt[vp])
                    T.writes(logits[seq_ids[pos2seq_id[vp]], token_ids[vp]])
                    logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] = logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] - (penalties[pos2seq_id[vp], 0] + T.Cast("float32", token_cnt[vp]) * penalties[pos2seq_id[vp], 1])
                    logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] = T.if_then_else(logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] > T.float32(0), logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] * penalties[pos2seq_id[vp], 2], logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] / penalties[pos2seq_id[vp], 2])

    # Row-wise argsort delegated to Thrust via tvm.contrib.thrust.sort; the workspace
    # buffer sizing and the sorted-values scratch buffer are opaque to TIR.
    @T.prim_func
    def argsort_thrust(var_probs: T.handle, var_lv: T.handle, var_topk_gpu_v1: T.handle):
        T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        batch_size, vocab_size = T.int64(), T.int64()
        data_buf = T.match_buffer(var_probs, (batch_size, vocab_size), align=8)
        workspace_buf = T.match_buffer(var_lv, (T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12),), "uint8", align=8)
        indices_buf = T.match_buffer(var_topk_gpu_v1, (batch_size, vocab_size), "int32", align=8)
        # with T.block("root"):
        value_buf = T.alloc_buffer((batch_size, vocab_size), align=8)
        with T.block("topk_gpu"):
            T.reads()
            T.writes()
            T.call_packed("tvm.contrib.thrust.sort", T.tvm_stack_make_array(data_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.tvm_stack_make_array(value_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.tvm_stack_make_array(indices_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, 0, T.int64(0)), 0, T.tvm_stack_make_array(workspace_buf.data, T.tvm_stack_make_shape(T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12)), 0, 1, T.uint8(0), T.int64(0)))

    # Single-token (decode) paged-KV attention: 20 heads, head_dim 64, page size 16,
    # optional rotary embedding, online-softmax accumulation in float32 with a final
    # cross-threadIdx.z merge through shared memory. Emits attention output and LSE.
    @T.prim_func
    def batch_decode_paged_kv(_0: T.int32, Q_handle: T.handle, pages_handle: T.handle, page_table_indptr_handle: T.handle, page_table_values_handle: T.handle, var_length_info: T.handle, k_rope_pos_offset_handle: T.handle, q_rope_position_handle: T.handle, output_handle: T.handle, lse_handle: T.handle, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32):
        T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1})
        B = T.int32(is_size_var=True)
        Q = T.match_buffer(Q_handle, (B, 20, 64), "float16")
        max_num_pages = T.int32(is_size_var=True)
        pages = T.match_buffer(pages_handle, (max_num_pages, 2, 20, 16, 64), "float16")
        page_table_indptr = T.match_buffer(page_table_indptr_handle, (B + 1,), "int32", offset_factor=1)
        nnz_pages = T.int32(is_size_var=True)
        page_table_values = T.match_buffer(page_table_values_handle, (nnz_pages,), "int32", offset_factor=1)
        length_info = T.match_buffer(var_length_info, (B,), "int32", offset_factor=1)
        k_rope_pos_offset = T.match_buffer(k_rope_pos_offset_handle, (B,), "int32", offset_factor=1)
        q_rope_position = T.match_buffer(q_rope_position_handle, (B,), "int32", offset_factor=1)
        output = T.match_buffer(output_handle, (B, 20, 64), "float16")
        lse = T.match_buffer(lse_handle, (B, 20))
        # with T.block("root"):
        sm_scale: T.float32 = T.float32(0.18033688011112042)
        for bx in T.thread_binding(B, thread="blockIdx.x"):
            for fused_by_bz in T.thread_binding(20, thread="blockIdx.y"):
                for ty in T.thread_binding(1, thread="threadIdx.y"):
                    for tx in T.thread_binding(16, thread="threadIdx.x"):
                        for tz in T.thread_binding(32, thread="threadIdx.z"):
                            with T.block("attn"):
                                T.reads(page_table_indptr[bx:bx + 2], length_info[bx], q_rope_position[bx], Q[bx, fused_by_bz // 20 + ty + fused_by_bz % 20, tx * 4 - 32:tx * 4 - 32 + 68])
                                T.writes(output[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty, tx * 4:tx * 4 + 4], lse[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty])
                                Q_local = T.alloc_buffer((4,), "float16", scope="local")
                                kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local")
                                K_smem = T.alloc_buffer((64, 64), "float16", scope="shared")
                                V_smem = T.alloc_buffer((64, 64), "float16", scope="shared")
                                O_allreduce = T.alloc_buffer((32, 1, 64), scope="shared")
                                md_allreduce = T.alloc_buffer((32, 1, 2), scope="shared")
                                S_reduce_local = T.alloc_buffer((1,), scope="local")
                                t0 = T.alloc_buffer((1,), scope="local")
                                S_local = T.alloc_buffer((2,), scope="local")
                                QK_local = T.alloc_buffer((4,), scope="local")
                                V_local = T.alloc_buffer((4,), "float16", scope="local")
                                m_prev = T.alloc_buffer((1,), scope="local")
                                d_prev = T.alloc_buffer((1,), scope="local")
                                other_m = T.alloc_buffer((1,), scope="local")
                                other_d = T.alloc_buffer((1,), scope="local")
                                exp_mprev = T.alloc_buffer((1,), scope="local")
                                exp_otherm = T.alloc_buffer((1,), scope="local")
                                other_o = T.alloc_buffer((4,), scope="local")
                                st_m = T.alloc_buffer((1,), scope="local")
                                st_d = T.alloc_buffer((1,), scope="local")
                                O_local = T.alloc_buffer((4,), scope="local")
                                by: T.int32 = fused_by_bz % 20
                                bz: T.int32 = fused_by_bz // 20
                                batch_idx: T.int32 = bx
                                cur_page_indptr_begin: T.int32 = page_table_indptr[batch_idx]
                                cur_page_indptr_end: T.int32 = page_table_indptr[batch_idx + 1]
                                # Sequence length: full pages plus the (possibly partial) last page.
                                kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[batch_idx], 0)
                                # Online-softmax running state: max (st_m), denominator (st_d), output accum.
                                st_m[0] = T.float32(-50000)
                                st_d[0] = T.float32(1)
                                for vec in T.vectorized(4):
                                    O_local[vec] = T.float32(0)
                                # Load this thread's 4 query lanes, optionally applying rotary embedding.
                                for vec in T.vectorized(4):
                                    Q_local[vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", Q[bx, by + bz + ty, tx * 4 + vec]) + T.sin(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, Q[bx, by + bz + ty, tx * 4 + vec + 32] * T.float16(-1), Q[bx, by + bz + ty, tx * 4 + vec - 32]))), Q[bx, by + bz + ty, tx * 4 + vec])
                                # Process the KV sequence in 64-row tiles staged through shared memory.
                                for iterator in range((kv_chunk_len[0] + 63) // 64):
                                    tile_start_s: T.int32 = (tz + ty) * 2
                                    tile_start_g: T.int32 = (iterator * 32 + tz + ty) * 2
                                    for j in range(2):
                                        with T.block("KV_load"):
                                            T.reads()
                                            T.writes()
                                            row_g: T.int32 = tile_start_g + j
                                            if row_g < kv_chunk_len[0]:
                                                seq_offset: T.int32 = row_g
                                                page_no: T.int32 = page_table_values[cur_page_indptr_begin + seq_offset // 16]
                                                page_offset: T.int32 = seq_offset % 16
                                                for vec in T.vectorized(4):
                                                    K_smem[tile_start_s + j, tx * 4 + vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, tx * 4 + vec]) + T.sin(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, pages[page_no, 0, by, page_offset, tx * 4 + vec + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, tx * 4 + vec - 32]))), pages[page_no, 0, by, page_offset, tx * 4 + vec])
                                                    V_smem[tile_start_s + j, tx * 4 + vec] = pages[page_no, 1, by, page_offset, tx * 4 + vec]
                                            else:
                                                for vec in T.vectorized(4):
                                                    K_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0)
                                                    V_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0)
                                    T.tvm_storage_sync("shared")
                                    m_prev[0] = st_m[0]
                                    # Scores for this thread's 2 rows: partial dot, then allreduce over tx.
                                    for j in range(2):
                                        for vec in T.vectorized(4):
                                            QK_local[vec] = T.Cast("float32", Q_local[vec]) * T.Cast("float32", K_smem[tz * 2 + j, tx * 4 + vec]) * attn_score_scaling_factor * sm_scale
                                        S_reduce_local[0] = T.float32(0)
                                        for vec in T.unroll(4):
                                            S_reduce_local[0] = S_reduce_local[0] + QK_local[vec]
                                        with T.block("block_cross_thread"):
                                            T.reads(S_reduce_local[0])
                                            T.writes(t0[0])
                                            T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0)))
                                            T.tvm_thread_allreduce(T.uint32(1), S_reduce_local[0], T.bool(True), t0[0], tx)
                                        S_local[j] = T.float32(-50000)
                                        if (iterator * 32 + tz) * 2 + j < kv_chunk_len[0]:
                                            S_local[j] = t0[0]
                                        st_m[0] = T.max(st_m[0], S_local[j])
                                    # Rescale running denominator/accumulator by the max shift (base-2 softmax).
                                    o_scale: T.float32 = T.exp2(m_prev[0] - st_m[0])
                                    st_d[0] = st_d[0] * o_scale
                                    for j in range(2):
                                        S_local[j] = T.exp2(S_local[j] - st_m[0])
                                        st_d[0] = st_d[0] + S_local[j]
                                    for j in T.vectorized(4):
                                        O_local[j] = O_local[j] * o_scale
                                    for j in range(2):
                                        for vec in T.vectorized(4):
                                            V_local[vec] = V_smem[tz * 2 + j, tx * 4 + vec]
                                        for vec in T.vectorized(4):
                                            O_local[vec] = O_local[vec] + T.Cast("float32", V_local[vec]) * S_local[j]
                                # Merge the 32 threadIdx.z partial results through shared memory.
                                for vec in T.vectorized(4):
                                    O_allreduce[tz, ty, tx * 4 + vec] = O_local[vec]
                                md_allreduce[tz, ty, 0] = st_m[0]
                                md_allreduce[tz, ty, 1] = st_d[0]
                                T.tvm_storage_sync("shared")
                                st_m[0] = T.float32(-50000)
                                st_d[0] = T.float32(1)
                                for vec in T.vectorized(4):
                                    O_local[vec] = T.float32(0)
                                for j in range(32):
                                    m_prev[0] = st_m[0]
                                    d_prev[0] = st_d[0]
                                    other_m[0] = md_allreduce[j, ty, 0]
                                    other_d[0] = md_allreduce[j, ty, 1]
                                    for vec in T.vectorized(4):
                                        other_o[vec] = O_allreduce[j, ty, tx * 4 + vec]
                                    st_m[0] = T.max(st_m[0], other_m[0])
                                    st_d[0] = d_prev[0] * T.exp2(m_prev[0] - st_m[0]) + other_d[0] * T.exp2(other_m[0] - st_m[0])
                                    exp_mprev[0] = T.exp2(m_prev[0] - st_m[0])
                                    exp_otherm[0] = T.exp2(other_m[0] - st_m[0])
                                    for vec in T.vectorized(4):
                                        O_local[vec] = O_local[vec] * exp_mprev[0] + other_o[vec] * exp_otherm[0]
                                # Final normalization, output store, and log-sum-exp (base 2).
                                for vec in T.vectorized(4):
                                    O_local[vec] = O_local[vec] / st_d[0]
                                for vec in T.vectorized(4):
                                    output[batch_idx, by + bz + ty, tx * 4 + vec] = T.Cast("float16", O_local[vec])
                                lse[batch_idx, by + bz + ty] = st_m[0] + T.log2(st_d[0])

    # Sliding-window variant of the decode attention kernel. NOTE(review): this chunk
    # of the dump is truncated mid-definition; the text below reproduces it verbatim
    # up to the cut point and the remainder continues in the next chunk of the file.
    @T.prim_func
    def batch_decode_paged_kv_sliding_window(_0: T.int32, Q_handle: T.handle, pages_handle: T.handle, page_table_indptr_handle: T.handle, page_table_values_handle: T.handle, var_length_info: T.handle, k_rope_pos_offset_handle: T.handle, q_rope_position_handle: T.handle, output_handle: T.handle, lse_handle: T.handle, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32):
        T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1})
        B = T.int32(is_size_var=True)
        Q = T.match_buffer(Q_handle, (B, 20, 64), "float16")
        max_num_pages = T.int32(is_size_var=True)
        pages = T.match_buffer(pages_handle, (max_num_pages, 2, 20, 16, 64), "float16")
        page_table_indptr = T.match_buffer(page_table_indptr_handle, (B + 1,), "int32", offset_factor=1)
        nnz_pages = T.int32(is_size_var=True)
        page_table_values = T.match_buffer(page_table_values_handle, (nnz_pages,), "int32", offset_factor=1)
        length_info = T.match_buffer(var_length_info, (3, B), "int32", offset_factor=1)
        k_rope_pos_offset = T.match_buffer(k_rope_pos_offset_handle, (B,), "int32", offset_factor=1)
        q_rope_position = T.match_buffer(q_rope_position_handle, (B,), "int32", offset_factor=1)
        output = T.match_buffer(output_handle, (B, 20, 64), "float16")
        lse = T.match_buffer(lse_handle, (B, 20))
        # with T.block("root"):
        sm_scale: T.float32 = T.float32(0.18033688011112042)
        for bx in T.thread_binding(B, thread="blockIdx.x"):
            for fused_by_bz in T.thread_binding(20, thread="blockIdx.y"):
                for ty in T.thread_binding(1, thread="threadIdx.y"):
                    for tx in T.thread_binding(16, thread="threadIdx.x"):
                        for tz in T.thread_binding(32, thread="threadIdx.z"):
                            with T.block("attn"):
                                T.reads(page_table_indptr[bx:bx + 2], length_info[0:3, bx], q_rope_position[bx], Q[bx, fused_by_bz // 20 + ty + fused_by_bz % 20, tx * 4 - 32:tx * 4 - 32 + 68])
                                T.writes(output[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty, tx * 4:tx * 4 + 4], lse[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty])
                                Q_local = T.alloc_buffer((4,), "float16", scope="local")
                                kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local")
                                K_smem = T.alloc_buffer((64, 64), "float16", scope="shared")
                                V_smem = 
T.alloc_buffer((64, 64), "float16", scope="shared") O_allreduce = T.alloc_buffer((32, 1, 64), scope="shared") md_allreduce = T.alloc_buffer((32, 1, 2), scope="shared") S_reduce_local = T.alloc_buffer((1,), scope="local") t0 = T.alloc_buffer((1,), scope="local") S_local = T.alloc_buffer((2,), scope="local") QK_local = T.alloc_buffer((4,), scope="local") V_local = T.alloc_buffer((4,), "float16", scope="local") m_prev = T.alloc_buffer((1,), scope="local") d_prev = T.alloc_buffer((1,), scope="local") other_m = T.alloc_buffer((1,), scope="local") other_d = T.alloc_buffer((1,), scope="local") exp_mprev = T.alloc_buffer((1,), scope="local") exp_otherm = T.alloc_buffer((1,), scope="local") other_o = T.alloc_buffer((4,), scope="local") st_m = T.alloc_buffer((1,), scope="local") st_d = T.alloc_buffer((1,), scope="local") O_local = T.alloc_buffer((4,), scope="local") by: T.int32 = fused_by_bz % 20 bz: T.int32 = fused_by_bz // 20 batch_idx: T.int32 = bx cur_page_indptr_begin: T.int32 = page_table_indptr[batch_idx] cur_page_indptr_end: T.int32 = page_table_indptr[batch_idx + 1] kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[0, batch_idx] - length_info[1, batch_idx] + length_info[2, batch_idx], 0) st_m[0] = T.float32(-50000) st_d[0] = T.float32(1) for vec in T.vectorized(4): O_local[vec] = T.float32(0) for vec in T.vectorized(4): Q_local[vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", Q[bx, by + bz + ty, tx * 4 + vec]) + T.sin(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, Q[bx, by + bz + ty, tx * 4 + vec + 32] * T.float16(-1), Q[bx, by + bz + ty, tx * 4 + vec - 
32]))), Q[bx, by + bz + ty, tx * 4 + vec]) for iterator in range((kv_chunk_len[0] + 63) // 64): tile_start_s: T.int32 = (tz + ty) * 2 tile_start_g: T.int32 = (iterator * 32 + tz + ty) * 2 for j in range(2): with T.block("KV_load"): T.reads() T.writes() row_g: T.int32 = tile_start_g + j if row_g < kv_chunk_len[0]: seq_offset: T.int32 = T.if_then_else(row_g < length_info[2, batch_idx], row_g, row_g - length_info[2, batch_idx] + length_info[1, batch_idx]) page_no: T.int32 = page_table_values[cur_page_indptr_begin + seq_offset // 16] page_offset: T.int32 = seq_offset % 16 for vec in T.vectorized(4): K_smem[tile_start_s + j, tx * 4 + vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, tx * 4 + vec]) + T.sin(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, pages[page_no, 0, by, page_offset, tx * 4 + vec + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, tx * 4 + vec - 32]))), pages[page_no, 0, by, page_offset, tx * 4 + vec]) V_smem[tile_start_s + j, tx * 4 + vec] = pages[page_no, 1, by, page_offset, tx * 4 + vec] else: for vec in T.vectorized(4): K_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0) V_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0) T.tvm_storage_sync("shared") m_prev[0] = st_m[0] for j in range(2): for vec in T.vectorized(4): QK_local[vec] = T.Cast("float32", Q_local[vec]) * T.Cast("float32", K_smem[tz * 2 + j, tx * 4 + vec]) * attn_score_scaling_factor * sm_scale S_reduce_local[0] = T.float32(0) for vec in T.unroll(4): S_reduce_local[0] = S_reduce_local[0] + QK_local[vec] with T.block("block_cross_thread"): T.reads(S_reduce_local[0]) T.writes(t0[0]) T.attr(T.comm_reducer(lambda 
x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), S_reduce_local[0], T.bool(True), t0[0], tx) S_local[j] = T.float32(-50000) if (iterator * 32 + tz) * 2 + j < kv_chunk_len[0]: S_local[j] = t0[0] st_m[0] = T.max(st_m[0], S_local[j]) o_scale: T.float32 = T.exp2(m_prev[0] - st_m[0]) st_d[0] = st_d[0] * o_scale for j in range(2): S_local[j] = T.exp2(S_local[j] - st_m[0]) st_d[0] = st_d[0] + S_local[j] for j in T.vectorized(4): O_local[j] = O_local[j] * o_scale for j in range(2): for vec in T.vectorized(4): V_local[vec] = V_smem[tz * 2 + j, tx * 4 + vec] for vec in T.vectorized(4): O_local[vec] = O_local[vec] + T.Cast("float32", V_local[vec]) * S_local[j] for vec in T.vectorized(4): O_allreduce[tz, ty, tx * 4 + vec] = O_local[vec] md_allreduce[tz, ty, 0] = st_m[0] md_allreduce[tz, ty, 1] = st_d[0] T.tvm_storage_sync("shared") st_m[0] = T.float32(-50000) st_d[0] = T.float32(1) for vec in T.vectorized(4): O_local[vec] = T.float32(0) for j in range(32): m_prev[0] = st_m[0] d_prev[0] = st_d[0] other_m[0] = md_allreduce[j, ty, 0] other_d[0] = md_allreduce[j, ty, 1] for vec in T.vectorized(4): other_o[vec] = O_allreduce[j, ty, tx * 4 + vec] st_m[0] = T.max(st_m[0], other_m[0]) st_d[0] = d_prev[0] * T.exp2(m_prev[0] - st_m[0]) + other_d[0] * T.exp2(other_m[0] - st_m[0]) exp_mprev[0] = T.exp2(m_prev[0] - st_m[0]) exp_otherm[0] = T.exp2(other_m[0] - st_m[0]) for vec in T.vectorized(4): O_local[vec] = O_local[vec] * exp_mprev[0] + other_o[vec] * exp_otherm[0] for vec in T.vectorized(4): O_local[vec] = O_local[vec] / st_d[0] for vec in T.vectorized(4): output[batch_idx, by + bz + ty, tx * 4 + vec] = T.Cast("float16", O_local[vec]) lse[batch_idx, by + bz + ty] = st_m[0] + T.log2(st_d[0]) @T.prim_func def batch_prefill_paged_kv(_0: T.int32, var_q: T.handle, var_q_indptr: T.handle, var_pages: T.handle, var_page_indptr: T.handle, var_page_values: T.handle, var_length_info: T.handle, var_k_rope_pos_offset: 
T.handle, var_q_rope_position: T.handle, var_output: T.handle, var_lse: T.handle, causal: T.int32, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) total_len = T.int32(is_size_var=True) q = T.match_buffer(var_q, (total_len, 20, 64), "float16") batch_size = T.int32(is_size_var=True) q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) max_num_pages = T.int32(is_size_var=True) pages = T.match_buffer(var_pages, (max_num_pages, 2, 20, 16, 64), "float16") page_indptr = T.match_buffer(var_page_indptr, (batch_size + 1,), "int32", offset_factor=1) nnz_pages = T.int32(is_size_var=True) page_values = T.match_buffer(var_page_values, (nnz_pages,), "int32", offset_factor=1) length_info = T.match_buffer(var_length_info, (batch_size,), "int32", offset_factor=1) k_rope_pos_offset = T.match_buffer(var_k_rope_pos_offset, (batch_size,), "int32", offset_factor=1) q_rope_position = T.match_buffer(var_q_rope_position, (total_len,), "int32", offset_factor=1) output = T.match_buffer(var_output, (total_len, 20, 64), "float16") lse = T.match_buffer(var_lse, (total_len, 20)) # with T.block("root"): for lbx in T.thread_binding(16, thread="blockIdx.x"): for lby in T.thread_binding(20, thread="blockIdx.y"): for lty in T.thread_binding(4, thread="threadIdx.y"): for ltx in T.thread_binding(32, thread="threadIdx.x"): with T.block("attn"): bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) T.reads() T.writes() tile_id = T.alloc_buffer((1,), "int32", scope="local") batch_idx = T.alloc_buffer((1,), "int32", scope="local") batch_tiles = 
T.alloc_buffer((1,), "int32", scope="local") batch_rows = T.alloc_buffer((1,), "int32", scope="local") iterator = T.alloc_buffer((1,), "int32", scope="local") kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") K_smem = T.alloc_buffer((16, 64), "float16", scope="shared") V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") S_smem = T.alloc_buffer((32, 16), scope="shared") S_local = T.alloc_buffer((32, 16), scope="local") O_local = T.alloc_buffer((32, 64), scope="local") m_smem = T.alloc_buffer((32,), scope="shared") m_prev_smem = T.alloc_buffer((32,), scope="shared") d_smem = T.alloc_buffer((32,), scope="shared") m_new = T.alloc_buffer((1,), scope="local") m_prev = T.alloc_buffer((1,), scope="local") d_new = T.alloc_buffer((1,), scope="local") tile_id[0] = bx batch_idx[0] = 0 batch_rows[0] = q_indptr[1] - q_indptr[0] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 while T.tvm_thread_invariant(batch_idx[0] < batch_size): while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: tile_id[0] = tile_id[0] - batch_tiles[0] batch_idx[0] = batch_idx[0] + 1 if batch_idx[0] < batch_size: b_idx: T.int32 = batch_idx[0] batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 if T.tvm_thread_invariant(batch_idx[0] < batch_size): b_idx: T.int32 = batch_idx[0] LH_start: T.int32 = tile_id[0] * 32 q_indptr_val: T.int32 = q_indptr[b_idx] cur_page_indptr_begin: T.int32 = page_indptr[b_idx] cur_page_indptr_end: T.int32 = page_indptr[b_idx + 1] kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[b_idx], 0) T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: m_smem[row] = T.float32(-50000) d_smem[row] = T.float32(1) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in 
T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads() T.writes(O_local[i, j]) O_local[i, j] = T.float32(0) T.tvm_storage_sync("shared") for li_lj_fused_0 in range(4): for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for li_lj_fused_3 in T.vectorized(4): with T.block("Q_load"): i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = q_indptr_val + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", q[cur_L, cur_H_qo, j]) + T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]))), q[cur_L, cur_H_qo, j]) else: Q_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for iterator_1 in range((kv_chunk_len[0] + 15) // 16): L_kv_start: T.int32 = iterator_1 * 16 for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("K_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + 
lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: seq_offset: T.int32 = cur_L page_no: T.int32 = page_values[cur_page_indptr_begin + seq_offset // 16] page_offset: T.int32 = seq_offset % 16 K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, j]) + T.sin(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, pages[page_no, 0, by, page_offset, j + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, j - 32]))), pages[page_no, 0, by, page_offset, j]) else: K_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("V_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: seq_offset: T.int32 = cur_L page_no: T.int32 = page_values[cur_page_indptr_begin + seq_offset // 16] page_offset: T.int32 = seq_offset % 16 V_smem[i, j] = pages[page_no, 1, by, page_offset, j] else: V_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") with T.block(""): T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) T.writes(S_local[0:32, 0:16]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(2, 2): with 
T.block("S_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) T.reads() T.writes(S_local[i, j]) S_local[i, j] = T.float32(0) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): with T.block("S_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) k = T.axis.reduce(64, lk_0 * 8 + lk_1) T.reads(S_local[i, j], Q_smem[i, k], K_smem[j, k]) T.writes(S_local[i, j]) S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k]) * T.Cast("float32", K_smem[j, k]) * attn_score_scaling_factor * T.float32(0.18033688011112042) T.tvm_storage_sync("shared") for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(2, 2): with T.block("S_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) T.reads(S_local[i, j]) T.writes(S_smem[i, j]) S_smem[i, j] = S_local[i, j] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update1"): T.reads(m_smem[row], kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) T.writes(m_prev[i], m_new[i], d_new[i]) m_prev[i] = m_smem[row] m_new[i] = m_smem[row] row_: T.int32 = LH_start + row for j in range(16): if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): m_new[i] = T.max(m_new[i], S_smem[row, j]) d_new[i] 
= d_smem[row] * T.exp2(m_prev[i] - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx with T.block("update"): T.reads(kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) T.writes(S_smem[row, 0:16]) for j in range(16): if row < 32: row_: T.int32 = LH_start + row if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) else: S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update"): T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) for j in range(16): d_new[i] = d_new[i] + S_smem[row, j] m_smem[row] = m_new[i] d_smem[row] = d_new[i] m_prev_smem[row] = m_prev[i] T.tvm_storage_sync("shared") with T.block(""): T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) T.writes(O_local[0:32, 0:64]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(4, 4): with T.block("O_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) T.reads() T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): with T.block("O_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) k = 
T.axis.reduce(16, lk_0 * 8 + lk_1) T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], S_smem[i, k], V_smem[k, j]) T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] + S_smem[i, k] * T.Cast("float32", V_smem[k, j]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) for li_0 in range(1): for li_1 in T.thread_binding(4, thread="threadIdx.y"): for li_2 in T.thread_binding(32, thread="threadIdx.x"): with T.block("lse_store"): i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) T.reads(q_indptr[b_idx:b_idx + 2], m_smem[i], d_smem[i]) T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) tile_id[0] = tile_id[0] + 16 @T.prim_func def batch_prefill_paged_kv_sliding_window(_0: T.int32, var_q: T.handle, var_q_indptr: T.handle, var_pages: T.handle, var_page_indptr: T.handle, var_page_values: T.handle, var_length_info: T.handle, var_k_rope_pos_offset: T.handle, var_q_rope_position: T.handle, var_output: T.handle, var_lse: T.handle, causal: T.int32, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": 
"x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) total_len = T.int32(is_size_var=True) q = T.match_buffer(var_q, (total_len, 20, 64), "float16") batch_size = T.int32(is_size_var=True) q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) max_num_pages = T.int32(is_size_var=True) pages = T.match_buffer(var_pages, (max_num_pages, 2, 20, 16, 64), "float16") page_indptr = T.match_buffer(var_page_indptr, (batch_size + 1,), "int32", offset_factor=1) nnz_pages = T.int32(is_size_var=True) page_values = T.match_buffer(var_page_values, (nnz_pages,), "int32", offset_factor=1) length_info = T.match_buffer(var_length_info, (3, batch_size), "int32", offset_factor=1) k_rope_pos_offset = T.match_buffer(var_k_rope_pos_offset, (batch_size,), "int32", offset_factor=1) q_rope_position = T.match_buffer(var_q_rope_position, (total_len,), "int32", offset_factor=1) output = T.match_buffer(var_output, (total_len, 20, 64), "float16") lse = T.match_buffer(var_lse, (total_len, 20)) # with T.block("root"): for lbx in T.thread_binding(16, thread="blockIdx.x"): for lby in T.thread_binding(20, thread="blockIdx.y"): for lty in T.thread_binding(4, thread="threadIdx.y"): for ltx in T.thread_binding(32, thread="threadIdx.x"): with T.block("attn"): bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) T.reads() T.writes() tile_id = T.alloc_buffer((1,), "int32", scope="local") batch_idx = T.alloc_buffer((1,), "int32", scope="local") batch_tiles = T.alloc_buffer((1,), "int32", scope="local") batch_rows = T.alloc_buffer((1,), "int32", scope="local") iterator = T.alloc_buffer((1,), "int32", scope="local") kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") K_smem = T.alloc_buffer((16, 64), "float16", 
scope="shared") V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") S_smem = T.alloc_buffer((32, 16), scope="shared") S_local = T.alloc_buffer((32, 16), scope="local") O_local = T.alloc_buffer((32, 64), scope="local") m_smem = T.alloc_buffer((32,), scope="shared") m_prev_smem = T.alloc_buffer((32,), scope="shared") d_smem = T.alloc_buffer((32,), scope="shared") m_new = T.alloc_buffer((1,), scope="local") m_prev = T.alloc_buffer((1,), scope="local") d_new = T.alloc_buffer((1,), scope="local") tile_id[0] = bx batch_idx[0] = 0 batch_rows[0] = q_indptr[1] - q_indptr[0] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 while T.tvm_thread_invariant(batch_idx[0] < batch_size): while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: tile_id[0] = tile_id[0] - batch_tiles[0] batch_idx[0] = batch_idx[0] + 1 if batch_idx[0] < batch_size: b_idx: T.int32 = batch_idx[0] batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 if T.tvm_thread_invariant(batch_idx[0] < batch_size): b_idx: T.int32 = batch_idx[0] LH_start: T.int32 = tile_id[0] * 32 q_indptr_val: T.int32 = q_indptr[b_idx] cur_page_indptr_begin: T.int32 = page_indptr[b_idx] cur_page_indptr_end: T.int32 = page_indptr[b_idx + 1] kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[0, b_idx] - length_info[1, b_idx] + length_info[2, b_idx], 0) T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: m_smem[row] = T.float32(-50000) d_smem[row] = T.float32(1) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads() 
T.writes(O_local[i, j]) O_local[i, j] = T.float32(0) T.tvm_storage_sync("shared") for li_lj_fused_0 in range(4): for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for li_lj_fused_3 in T.vectorized(4): with T.block("Q_load"): i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = q_indptr_val + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", q[cur_L, cur_H_qo, j]) + T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]))), q[cur_L, cur_H_qo, j]) else: Q_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for iterator_1 in range((kv_chunk_len[0] + 15) // 16): L_kv_start: T.int32 = iterator_1 * 16 for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("K_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: seq_offset: T.int32 = T.if_then_else(cur_L < length_info[2, b_idx], cur_L, cur_L - length_info[2, b_idx] + length_info[1, b_idx]) page_no: T.int32 = 
page_values[cur_page_indptr_begin + seq_offset // 16] page_offset: T.int32 = seq_offset % 16 K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, j]) + T.sin(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, pages[page_no, 0, by, page_offset, j + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, j - 32]))), pages[page_no, 0, by, page_offset, j]) else: K_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("V_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: seq_offset: T.int32 = T.if_then_else(cur_L < length_info[2, b_idx], cur_L, cur_L - length_info[2, b_idx] + length_info[1, b_idx]) page_no: T.int32 = page_values[cur_page_indptr_begin + seq_offset // 16] page_offset: T.int32 = seq_offset % 16 V_smem[i, j] = pages[page_no, 1, by, page_offset, j] else: V_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") with T.block(""): T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) T.writes(S_local[0:32, 0:16]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(2, 2): with T.block("S_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + 
li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) T.reads() T.writes(S_local[i, j]) S_local[i, j] = T.float32(0) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): with T.block("S_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) k = T.axis.reduce(64, lk_0 * 8 + lk_1) T.reads(S_local[i, j], Q_smem[i, k], K_smem[j, k]) T.writes(S_local[i, j]) S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k]) * T.Cast("float32", K_smem[j, k]) * attn_score_scaling_factor * T.float32(0.18033688011112042) T.tvm_storage_sync("shared") for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(2, 2): with T.block("S_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) T.reads(S_local[i, j]) T.writes(S_smem[i, j]) S_smem[i, j] = S_local[i, j] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update1"): T.reads(m_smem[row], kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) T.writes(m_prev[i], m_new[i], d_new[i]) m_prev[i] = m_smem[row] m_new[i] = m_smem[row] row_: T.int32 = LH_start + row for j in range(16): if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): m_new[i] = T.max(m_new[i], S_smem[row, j]) d_new[i] = d_smem[row] * T.exp2(m_prev[i] - m_new[i]) for i in range(1): row: T.int32 
= i * 32 * 4 + ty * 32 + tx with T.block("update"): T.reads(kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) T.writes(S_smem[row, 0:16]) for j in range(16): if row < 32: row_: T.int32 = LH_start + row if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) else: S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update"): T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) for j in range(16): d_new[i] = d_new[i] + S_smem[row, j] m_smem[row] = m_new[i] d_smem[row] = d_new[i] m_prev_smem[row] = m_prev[i] T.tvm_storage_sync("shared") with T.block(""): T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) T.writes(O_local[0:32, 0:64]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(4, 4): with T.block("O_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) T.reads() T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): with T.block("O_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) k = T.axis.reduce(16, lk_0 * 8 + lk_1) T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], 
S_smem[i, k], V_smem[k, j]) T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] + S_smem[i, k] * T.Cast("float32", V_smem[k, j]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) for li_0 in range(1): for li_1 in T.thread_binding(4, thread="threadIdx.y"): for li_2 in T.thread_binding(32, thread="threadIdx.x"): with T.block("lse_store"): i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) T.reads(q_indptr[b_idx:b_idx + 2], m_smem[i], d_smem[i]) T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) tile_id[0] = tile_id[0] + 16 @T.prim_func def batch_prefill_ragged_kv(var_q: T.handle, var_q_indptr: T.handle, var_k: T.handle, var_v: T.handle, var_kv_indptr: T.handle, var_q_rope_position: T.handle, var_k_rope_pos_offset: T.handle, var_output: T.handle, var_lse: T.handle, causal: T.int32, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, 
"max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) qo_len = T.int32(is_size_var=True) q = T.match_buffer(var_q, (qo_len, 20, 64), "float16") batch_size = T.int32(is_size_var=True) q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) kv_len = T.int32(is_size_var=True) k = T.match_buffer(var_k, (kv_len, 20, 64), "float16") v = T.match_buffer(var_v, (kv_len, 20, 64), "float16") kv_indptr = T.match_buffer(var_kv_indptr, (batch_size + 1,), "int32", offset_factor=1) q_rope_position = T.match_buffer(var_q_rope_position, (qo_len,), "int32", offset_factor=1) k_rope_pos_offset = T.match_buffer(var_k_rope_pos_offset, (batch_size,), "int32", offset_factor=1) output = T.match_buffer(var_output, (qo_len, 20, 64), "float16") lse = T.match_buffer(var_lse, (qo_len, 20)) # with T.block("root"): for lbx in T.thread_binding(16, thread="blockIdx.x"): for lby in T.thread_binding(20, thread="blockIdx.y"): for lty in T.thread_binding(4, thread="threadIdx.y"): for ltx in T.thread_binding(32, thread="threadIdx.x"): with T.block("attn"): bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) T.reads() T.writes() tile_id = T.alloc_buffer((1,), "int32", scope="local") batch_idx = T.alloc_buffer((1,), "int32", scope="local") batch_tiles = T.alloc_buffer((1,), "int32", scope="local") batch_rows = T.alloc_buffer((1,), "int32", scope="local") iterator = T.alloc_buffer((1,), "int32", scope="local") kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") K_smem = T.alloc_buffer((16, 64), "float16", scope="shared") V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") S_smem = T.alloc_buffer((32, 16), scope="shared") S_local = T.alloc_buffer((32, 16), scope="local") O_local = T.alloc_buffer((32, 64), scope="local") m_smem = T.alloc_buffer((32,), scope="shared") m_prev_smem = T.alloc_buffer((32,), scope="shared") d_smem = T.alloc_buffer((32,), 
scope="shared") m_new = T.alloc_buffer((1,), scope="local") m_prev = T.alloc_buffer((1,), scope="local") d_new = T.alloc_buffer((1,), scope="local") tile_id[0] = bx batch_idx[0] = 0 batch_rows[0] = q_indptr[1] - q_indptr[0] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 while T.tvm_thread_invariant(batch_idx[0] < batch_size): while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: tile_id[0] = tile_id[0] - batch_tiles[0] batch_idx[0] = batch_idx[0] + 1 if batch_idx[0] < batch_size: b_idx: T.int32 = batch_idx[0] batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 if T.tvm_thread_invariant(batch_idx[0] < batch_size): b_idx: T.int32 = batch_idx[0] q_indptr_val: T.int32 = q_indptr[b_idx] LH_start: T.int32 = tile_id[0] * 32 kv_chunk_len[0] = kv_indptr[b_idx + 1] - kv_indptr[b_idx] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: m_smem[row] = T.float32(-50000) d_smem[row] = T.float32(1) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads() T.writes(O_local[i, j]) O_local[i, j] = T.float32(0) T.tvm_storage_sync("shared") for li_lj_fused_0 in range(4): for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for li_lj_fused_3 in T.vectorized(4): with T.block("Q_load"): i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = q_indptr_val + (LH_start + i) cur_H_qo: T.int32 = by if 
cur_L < q_indptr[b_idx + 1]: Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", q[cur_L, cur_H_qo, j]) + T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]))), q[cur_L, cur_H_qo, j]) else: Q_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for iterator_1 in range((kv_chunk_len[0] + 15) // 16): L_kv_start: T.int32 = iterator_1 * 16 L_kv_base: T.int32 = kv_indptr[b_idx] for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("K_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", k[L_kv_base + cur_L, by, j]) + T.sin(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, k[L_kv_base + cur_L, by, j + 32] * T.float16(-1), k[L_kv_base + cur_L, by, j - 32]))), k[L_kv_base + cur_L, by, j]) else: K_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in 
T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("V_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_start + i if cur_L < kv_chunk_len[0]: V_smem[i, j] = v[L_kv_base + cur_L, by, j] else: V_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") with T.block(""): T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) T.writes(S_local[0:32, 0:16]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(2, 2): with T.block("S_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) T.reads() T.writes(S_local[i, j]) S_local[i, j] = T.float32(0) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): with T.block("S_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) k_1 = T.axis.reduce(64, lk_0 * 8 + lk_1) T.reads(S_local[i, j], Q_smem[i, k_1], K_smem[j, k_1]) T.writes(S_local[i, j]) S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k_1]) * T.Cast("float32", K_smem[j, k_1]) * attn_score_scaling_factor * T.float32(0.18033688011112042) T.tvm_storage_sync("shared") for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(2, 2): with T.block("S_store"): i = 
T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) T.reads(S_local[i, j]) T.writes(S_smem[i, j]) S_smem[i, j] = S_local[i, j] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update1"): T.reads(m_smem[row], kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) T.writes(m_prev[i], m_new[i], d_new[i]) m_prev[i] = m_smem[row] m_new[i] = m_smem[row] row_: T.int32 = LH_start + row for j in range(16): if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): m_new[i] = T.max(m_new[i], S_smem[row, j]) d_new[i] = d_smem[row] * T.exp2(m_prev[i] - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx with T.block("update"): T.reads(kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) T.writes(S_smem[row, 0:16]) for j in range(16): if row < 32: row_: T.int32 = LH_start + row if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) else: S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update"): T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) for j in range(16): d_new[i] = d_new[i] + S_smem[row, j] m_smem[row] = m_new[i] d_smem[row] = d_new[i] m_prev_smem[row] = m_prev[i] T.tvm_storage_sync("shared") with T.block(""): T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) T.writes(O_local[0:32, 0:64]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init 
in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(4, 4): with T.block("O_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) T.reads() T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): with T.block("O_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) k_1 = T.axis.reduce(16, lk_0 * 8 + lk_1) T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], S_smem[i, k_1], V_smem[k_1, j]) T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] + S_smem[i, k_1] * T.Cast("float32", V_smem[k_1, j]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) for li_0 in range(1): for li_1 in T.thread_binding(4, thread="threadIdx.y"): for li_2 in T.thread_binding(32, thread="threadIdx.x"): with T.block("lse_store"): i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) T.reads(q_indptr[b_idx:b_idx + 2], 
# NOTE(review): collapsed auto-generated TVMScript; code tokens are byte-identical,
# only standalone comment lines are added. This line opens with the tail of
# batch_prefill_ragged_kv (its lse_store epilogue and tile_id advance). The def of
# batch_tree_attn begins mid-line: the same tiled online-softmax attention pipeline,
# but column validity comes from an explicit packed tree mask (mask / mn_indptr)
# instead of a causal flag — used for token-tree (speculative decoding) attention.
m_smem[i], d_smem[i]) T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) tile_id[0] = tile_id[0] + 16 @T.prim_func def batch_tree_attn(var_q: T.handle, var_q_indptr: T.handle, var_k: T.handle, var_v: T.handle, var_kv_indptr: T.handle, var_q_rope_position: T.handle, var_mn_indptr: T.handle, var_mask: T.handle, var_output: T.handle, var_lse: T.handle, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32, batch_size: T.int32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) qo_len = T.int32(is_size_var=True) q = T.match_buffer(var_q, (qo_len, 20, 64), "float16") q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) kv_len = T.int32(is_size_var=True) k = T.match_buffer(var_k, (kv_len, 20, 64), "float16") v = T.match_buffer(var_v, (kv_len, 20, 64), "float16") kv_indptr = T.match_buffer(var_kv_indptr, (batch_size + 1,), "int32", offset_factor=1) q_rope_position = T.match_buffer(var_q_rope_position, (qo_len,), "int32", offset_factor=1) mn_indptr = T.match_buffer(var_mn_indptr, (batch_size + 1,), "int32", offset_factor=1) tree_size = T.int32(is_size_var=True) mask = T.match_buffer(var_mask, (tree_size,), "int32", offset_factor=1) output = T.match_buffer(var_output, (qo_len, 20, 64), "float16") lse = T.match_buffer(var_lse, (qo_len, 20)) # with T.block("root"): for lbx in T.thread_binding(16, thread="blockIdx.x"): for lby in T.thread_binding(20, thread="blockIdx.y"): for lty in T.thread_binding(4,
# Thread geometry matches the prefill kernel: 16 x 20 CTAs (blockIdx.x x heads) of
# 4 x 32 threads. Per-CTA shared/local staging buffers follow, then the persistent
# tile loop that distributes 32-row query tiles of the batch across CTAs and
# initializes the online-softmax state (m = -50000, d = 1).
thread="threadIdx.y"): for ltx in T.thread_binding(32, thread="threadIdx.x"): with T.block("attn"): bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) T.reads() T.writes() tile_id = T.alloc_buffer((1,), "int32", scope="local") batch_idx = T.alloc_buffer((1,), "int32", scope="local") batch_tiles = T.alloc_buffer((1,), "int32", scope="local") batch_rows = T.alloc_buffer((1,), "int32", scope="local") iterator = T.alloc_buffer((1,), "int32", scope="local") kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") K_smem = T.alloc_buffer((16, 64), "float16", scope="shared") V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") S_smem = T.alloc_buffer((32, 16), scope="shared") S_local = T.alloc_buffer((32, 16), scope="local") O_local = T.alloc_buffer((32, 64), scope="local") m_smem = T.alloc_buffer((32,), scope="shared") m_prev_smem = T.alloc_buffer((32,), scope="shared") d_smem = T.alloc_buffer((32,), scope="shared") m_new = T.alloc_buffer((1,), scope="local") m_prev = T.alloc_buffer((1,), scope="local") d_new = T.alloc_buffer((1,), scope="local") tile_id[0] = bx batch_idx[0] = 0 batch_rows[0] = q_indptr[1] - q_indptr[0] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 while T.tvm_thread_invariant(batch_idx[0] < batch_size): while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: tile_id[0] = tile_id[0] - batch_tiles[0] batch_idx[0] = batch_idx[0] + 1 if batch_idx[0] < batch_size: b_idx: T.int32 = batch_idx[0] batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 if T.tvm_thread_invariant(batch_idx[0] < batch_size): b_idx: T.int32 = batch_idx[0] LH_start: T.int32 = tile_id[0] * 32 q_indptr_val: T.int32 = q_indptr[b_idx] kv_chunk_len[0] = kv_indptr[b_idx + 1] - kv_indptr[b_idx] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: m_smem[row] = T.float32(-50000) d_smem[row] = T.float32(1)
# O_init zeroes the 32x64 fp32 accumulator; Q_load stages the Q tile with optional
# inline rotary embedding. Note the rotary math here casts cos/sin to fp16 and
# multiplies in fp16, unlike batch_prefill_ragged_kv which computes in fp32 and
# casts once at the end — presumably intentional in the generator, but worth noting.
for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads() T.writes(O_local[i, j]) O_local[i, j] = T.float32(0) T.tvm_storage_sync("shared") for li_lj_fused_0 in range(4): for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for li_lj_fused_3 in T.vectorized(4): with T.block("Q_load"): i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = q_indptr_val + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * q[cur_L, cur_H_qo, j] + T.Cast("float16", T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]), q[cur_L, cur_H_qo, j]) else: Q_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") for iterator_1 in range((kv_chunk_len[0] + 15) // 16): L_kv_start: T.int32 = iterator_1 * 16 L_kv_base: T.int32 = kv_indptr[b_idx] for lz_ly_fused_0 in range(2): for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): for lz_ly_fused_3 in T.vectorized(4): with T.block("KV_load"): i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 +
# KV_load stages K (rotary indexed by q_rope_position at the absolute KV position)
# and V together in one pass; rows past kv_chunk_len are zero-filled. Then S_gemm
# computes fp32 Q @ K^T with the same attn_score_scaling_factor * 0.18033688 scale
# as the prefill kernel.
lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) T.reads() T.writes() cur_L: T.int32 = L_kv_base + L_kv_start + i if L_kv_start + i < kv_chunk_len[0]: K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * k[cur_L, by, j] + T.Cast("float16", T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * T.if_then_else(j < 32, k[cur_L, by, j + 32] * T.float16(-1), k[cur_L, by, j - 32]), k[cur_L, by, j]) V_smem[i, j] = v[cur_L, by, j] else: K_smem[i, j] = T.float16(0) V_smem[i, j] = T.float16(0) T.tvm_storage_sync("shared") with T.block(""): T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) T.writes(S_local[0:32, 0:16]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(2, 2): with T.block("S_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) T.reads() T.writes(S_local[i, j]) S_local[i, j] = T.float32(0) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): with T.block("S_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) k_1 = T.axis.reduce(64, lk_0 * 8 + lk_1) T.reads(S_local[i, j], Q_smem[i, k_1], K_smem[j, k_1]) T.writes(S_local[i, j]) S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k_1]) *
# update1/update: online softmax per row where column j of this KV chunk is valid
# iff L_kv_start + j < kv_chunk_len AND
# mask[mn_indptr[b_idx] + row_ * q_len + (L_kv_start + j)] == 1 — i.e. a flattened
# per-request row-major tree attention mask; invalid columns collapse to ~0 weight.
T.Cast("float32", K_smem[j, k_1]) * attn_score_scaling_factor * T.float32(0.18033688011112042) T.tvm_storage_sync("shared") for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(2, 2): with T.block("S_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) T.reads(S_local[i, j]) T.writes(S_smem[i, j]) S_smem[i, j] = S_local[i, j] T.tvm_storage_sync("shared") for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update1"): T.reads(m_smem[row], kv_chunk_len[0], mask[mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start:mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start + 16], mn_indptr[b_idx], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) T.writes(m_prev[i], m_new[i], d_new[i]) m_prev[i] = m_smem[row] m_new[i] = m_smem[row] row_: T.int32 = LH_start + row for j in range(16): if L_kv_start + j < kv_chunk_len[0] and mask[mn_indptr[b_idx] + row_ * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + (L_kv_start + j)] == 1: m_new[i] = T.max(m_new[i], S_smem[row, j]) d_new[i] = d_smem[row] * T.exp2(m_prev[i] - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx with T.block("update"): T.reads(kv_chunk_len[0], mask[mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start:mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start + 16], mn_indptr[b_idx], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) T.writes(S_smem[row, 0:16]) for j in range(16): if row < 32: row_: T.int32 = LH_start + row if L_kv_start + j < kv_chunk_len[0] and mask[mn_indptr[b_idx] + row_ * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + (L_kv_start + j)] == 1:
# Finish the softmax step (S := exp2(S - m_new), accumulate d), then rescale O by
# exp2(m_prev - m_new) in O_gemm_init and accumulate S @ V in O_gemm_update;
# the O_store / lse_store epilogue continues on the next physical line.
S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) else: S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) for i in range(1): row: T.int32 = i * 32 * 4 + ty * 32 + tx if row < 32: with T.block("update"): T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) for j in range(16): d_new[i] = d_new[i] + S_smem[row, j] m_smem[row] = m_new[i] d_smem[row] = d_new[i] m_prev_smem[row] = m_prev[i] T.tvm_storage_sync("shared") with T.block(""): T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) T.writes(O_local[0:32, 0:64]) for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): for li_1_init, lj_1_init in T.grid(4, 4): with T.block("O_gemm_init"): i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) T.reads() T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): with T.block("O_gemm_update"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) k_1 = T.axis.reduce(16, lk_0 * 8 + lk_1) T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], S_smem[i, k_1], V_smem[k_1, j]) T.writes(O_local[i, j]) O_local[i, j] = O_local[i, j] + S_smem[i, k_1] * T.Cast("float32", V_smem[k_1, j]) for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): for li_1, lj_1 in T.grid(4, 4): with T.block("O_store"): i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32
+ li_0_lj_0_fused_1) // 16 * 4 + li_1) j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) for li_0 in range(1): for li_1 in T.thread_binding(4, thread="threadIdx.y"): for li_2 in T.thread_binding(32, thread="threadIdx.x"): with T.block("lse_store"): i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) T.reads(q_indptr[b_idx:b_idx + 2], m_smem[i], d_smem[i]) T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) cur_H_qo: T.int32 = by if cur_L < q_indptr[b_idx + 1]: lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) tile_id[0] = tile_id[0] + 16 @T.prim_func def batch_verify_on_gpu_single_kernel(var_draft_probs: T.handle, var_draft_tokens: T.handle, var_model_probs: T.handle, var_token_tree_first_child: T.handle, var_token_tree_next_sibling: T.handle, var_uniform_samples: T.handle, var_token_tree_parent_ptr: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) num_nodes, vocab_size = T.int32(is_size_var=True), T.int64() draft_probs = T.match_buffer(var_draft_probs, (num_nodes, vocab_size)) draft_tokens = T.match_buffer(var_draft_tokens, (num_nodes,), "int32") model_probs = T.match_buffer(var_model_probs, (num_nodes, vocab_size)) token_tree_first_child = T.match_buffer(var_token_tree_first_child, (num_nodes,), "int32") token_tree_next_sibling = T.match_buffer(var_token_tree_next_sibling, (num_nodes,), 
"int32") uniform_samples = T.match_buffer(var_uniform_samples, (num_nodes,)) nbatch = T.int32(is_size_var=True) token_tree_parent_ptr = T.match_buffer(var_token_tree_parent_ptr, (nbatch,), "int32") # with T.block("root"): child_ptr = T.alloc_buffer((1,), "int32", scope="local") parent_ptr = T.alloc_buffer((1,), "int32", scope="local") child_token = T.alloc_buffer((1,), "int32", scope="local") done = T.alloc_buffer((1,), "bool", scope="local") psum = T.alloc_buffer((1,), scope="local") t0 = T.alloc_buffer((1,), scope="local") model_prob_local = T.alloc_buffer((1,), scope="local") draft_prob_local = T.alloc_buffer((1,), scope="local") p_child = T.alloc_buffer((1,), scope="local") q_child = T.alloc_buffer((1,), scope="local") uniform_sample = T.alloc_buffer((1,), scope="local") pred_shared = T.alloc_buffer((1,), "bool", scope="shared") pred_local = T.alloc_buffer((1,), "bool", scope="local") for _bx in T.thread_binding(nbatch, thread="blockIdx.x"): for _tx in T.thread_binding(1024, thread="threadIdx.x"): with T.block("CTA"): b, tx = T.axis.remap("SS", [_bx, _tx]) T.reads(token_tree_parent_ptr[b], token_tree_first_child[T.min(parent_ptr[0], child_ptr[0]):T.min(parent_ptr[0], child_ptr[0]) + (T.max(parent_ptr[0], child_ptr[0]) + 1 - T.min(parent_ptr[0], child_ptr[0]))], parent_ptr[0], done[0], child_ptr[0], draft_tokens[child_ptr[0]], model_probs[parent_ptr[0], T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)):T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)) + (T.max(T.Cast("int64", child_token[0]), (vocab_size + T.int64(1023)) // T.int64(1024) * T.int64(1024) + T.Cast("int64", tx) - T.int64(1024)) + T.int64(1) - T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)))], child_token[0], draft_probs[child_ptr[0], T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)):T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)) + (T.max(T.Cast("int64", child_token[0]), (vocab_size + T.int64(1023)) // T.int64(1024) * T.int64(1024) + 
T.Cast("int64", tx) - T.int64(1024)) + T.int64(1) - T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)))], uniform_samples[child_ptr[0]], p_child[0], uniform_sample[0], q_child[0], pred_shared[0], pred_local[0], model_prob_local[0], draft_prob_local[0], psum[0], t0[0], token_tree_next_sibling[child_ptr[0]]) T.writes(parent_ptr[0], child_ptr[0], done[0], child_token[0], p_child[0], q_child[0], uniform_sample[0], pred_shared[0], pred_local[0], psum[0], model_prob_local[0], draft_prob_local[0], t0[0], model_probs[parent_ptr[0], T.Cast("int64", tx):T.Cast("int64", tx) + ((vocab_size + T.int64(1023)) // T.int64(1024) * T.int64(1024) - T.int64(1023))], token_tree_parent_ptr[b]) parent_ptr[0] = token_tree_parent_ptr[b] child_ptr[0] = token_tree_first_child[parent_ptr[0]] done[0] = T.bool(False) while not done[0]: T.tvm_storage_sync("shared") if child_ptr[0] == -1: done[0] = T.bool(True) T.tvm_storage_sync("shared") else: if tx == 0: child_token[0] = draft_tokens[child_ptr[0]] p_child[0] = model_probs[parent_ptr[0], child_token[0]] q_child[0] = draft_probs[child_ptr[0], child_token[0]] uniform_sample[0] = uniform_samples[child_ptr[0]] pred_shared[0] = p_child[0] >= uniform_sample[0] * q_child[0] T.tvm_storage_sync("shared") pred_local[0] = pred_shared[0] if pred_local[0]: parent_ptr[0] = child_ptr[0] child_ptr[0] = token_tree_first_child[child_ptr[0]] else: psum[0] = T.float32(0) for i in range((vocab_size + T.int64(1023)) // T.int64(1024)): if i * T.int64(1024) + T.Cast("int64", tx) < vocab_size: model_prob_local[0] = model_probs[parent_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] draft_prob_local[0] = draft_probs[child_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] model_prob_local[0] = T.max(model_prob_local[0] - draft_prob_local[0], T.float32(0)) psum[0] = psum[0] + model_prob_local[0] with T.block("block_cross_thread"): T.reads(psum[0]) T.writes(t0[0]) T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", 
T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), psum[0], T.bool(True), t0[0], tx) if t0[0] < T.float32(9.9999999999999995e-08): parent_ptr[0] = child_ptr[0] child_ptr[0] = token_tree_first_child[child_ptr[0]] else: for i in range((vocab_size + T.int64(1023)) // T.int64(1024)): if i * T.int64(1024) + T.Cast("int64", tx) < vocab_size: model_prob_local[0] = model_probs[parent_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] draft_prob_local[0] = draft_probs[child_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] model_prob_local[0] = T.max(model_prob_local[0] - draft_prob_local[0], T.float32(0)) model_probs[parent_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] = model_prob_local[0] / t0[0] child_ptr[0] = token_tree_next_sibling[child_ptr[0]] if tx == 0: token_tree_parent_ptr[b] = parent_ptr[0]
# ---------------------------------------------------------------------------
# chunk_lse(A, temperature) -> (chunked_sum, chunked_max)
# Per (batch row, 4096-wide chunk) reduction over the logits matrix A.
#   * block "max":     chunk maximum of A (divided by temperature when
#                      temperature > ~1e-5); out-of-vocab tail lanes
#                      (v1*4096 + v2 >= vocab_size) feed the identity -FLT_MAX.
#   * block "sum_exp": when temperature > ~1e-5 accumulates
#                      exp(x - chunk_max); otherwise it counts entries equal
#                      to the chunk max (greedy/argmax path).
#   * block "log":     chunked_sum = log(sum) in the softmax path, else the
#                      raw count; chunked_max = the chunk max.
# One CUDA block per (row, chunk); 256 threads x 16 serial steps cover the
# 4096 lanes of a chunk.
@T.prim_func def chunk_lse(var_A: T.handle, var_temperature: T.handle, var_chunked_sum: T.handle, var_chunked_max: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size, vocab_size = T.int64(is_size_var=True), T.int64(is_size_var=True) A = T.match_buffer(var_A, (batch_size, vocab_size)) temperature = T.match_buffer(var_temperature, (batch_size,)) num_chunks = T.int64(is_size_var=True) chunked_sum = T.match_buffer(var_chunked_sum, (batch_size, num_chunks)) chunked_max = T.match_buffer(var_chunked_max, (batch_size, num_chunks)) # with T.block("root"): temp_max_shared = T.alloc_buffer((batch_size, num_chunks), scope="shared") temp_sum_shared = T.alloc_buffer((batch_size, num_chunks), scope="shared") for ax0_ax1_fused in T.thread_binding(batch_size * num_chunks, thread="blockIdx.x"): for ax0, ax1 in T.grid(T.int64(1), 
T.int64(1)): for ax2_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax2_fused_0 in T.serial(T.int64(16), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("max"): v0 = T.axis.spatial(batch_size, ax0_ax1_fused % (num_chunks * batch_size) // num_chunks + ax0) v1 = T.axis.spatial(num_chunks, ax0_ax1_fused % num_chunks + ax1) v2 = T.axis.reduce(T.int64(4096), ax2_fused_0 * T.int64(256) + ax2_fused_1) T.reads(temperature[v0], A[v0, v1 * T.int64(4096) + v2]) T.writes(temp_max_shared[v0, v1]) with T.init(): temp_max_shared[v0, v1] = T.float32(-3.4028234663852886e+38) temp_max_shared[v0, v1] = T.max(temp_max_shared[v0, v1], T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), A[v0, v1 * T.int64(4096) + v2] / temperature[v0], A[v0, v1 * T.int64(4096) + v2]), T.float32(-3.4028234663852886e+38))) for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax2_fused_0 in T.serial(T.int64(16), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("sum_exp"): v0 = T.axis.spatial(batch_size, ax0_ax1_fused % (num_chunks * batch_size) // num_chunks + ax0) v1 = T.axis.spatial(num_chunks, ax0_ax1_fused % num_chunks + ax1) v2 = T.axis.reduce(T.int64(4096), ax2_fused_0 * T.int64(256) + ax2_fused_1) T.reads(temperature[v0], A[v0, v1 * T.int64(4096) + v2], temp_max_shared[v0, v1]) T.writes(temp_sum_shared[v0, v1]) with T.init(): temp_sum_shared[v0, v1] = T.float32(0) temp_sum_shared[v0, v1] = temp_sum_shared[v0, v1] + T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.Select(temperature[v0] > T.float32(1.0000000000000001e-05), T.exp(T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), A[v0, v1 * T.int64(4096) + v2] / temperature[v0], A[v0, v1 * T.int64(4096) + v2]), 
T.float32(-3.4028234663852886e+38)) - temp_max_shared[v0, v1]), T.Cast("float32", T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), A[v0, v1 * T.int64(4096) + v2] / temperature[v0], A[v0, v1 * T.int64(4096) + v2]), T.float32(-3.4028234663852886e+38)) == temp_max_shared[v0, v1])), T.float32(0)) for ax2_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("log"): v0 = T.axis.spatial(batch_size, ax0_ax1_fused % (num_chunks * batch_size) // num_chunks) v1 = T.axis.spatial(num_chunks, ax0_ax1_fused % num_chunks) v2 = T.axis.spatial(T.int64(1), ax2_0 * T.int64(256) + ax2_1) T.where(ax2_0 * T.int64(256) + ax2_1 < T.int64(1)) T.reads(temperature[v0], temp_sum_shared[v0, v1], temp_max_shared[v0, v1]) T.writes(chunked_sum[v0, v1], chunked_max[v0, v1]) chunked_sum[v0, v1] = T.Select(temperature[v0] > T.float32(1.0000000000000001e-05), T.log(temp_sum_shared[v0, v1]), temp_sum_shared[v0, v1]) chunked_max[v0, v1] = temp_max_shared[v0, v1]
# ---------------------------------------------------------------------------
# compact_kv_copy: compacts the paged KV cache (layout
# (num_pages, K/V, 20 heads, page_size 16, head_dim 64), fp16).  For each
# batch element b, the CSR-style range copy_length_indptr[b]..[b+1] indexes
# columns of copy_src_dst_pos = (src slot, dst slot) pairs; both the K
# (index 0) and V (index 1) planes are copied slot-by-slot.  One thread per
# (b, head, dim) triple.
@T.prim_func def compact_kv_copy(var_pages: T.handle, var_copy_length_indptr: T.handle, var_copy_src_dst_pos: T.handle, batch_size: T.int32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) num_pages = T.int32() pages = T.match_buffer(var_pages, (num_pages, 2, 20, 16, 64), "float16") copy_length_indptr = T.match_buffer(var_copy_length_indptr, (batch_size + 1,), "int32", offset_factor=1) total_copy_length = T.int32() copy_src_dst_pos = T.match_buffer(var_copy_src_dst_pos, (2, total_copy_length), "int32", offset_factor=1) with 
T.block("root"): T.reads() T.writes() for bhd_o in T.thread_binding((batch_size * 1280 + 1023) // 1024, thread="blockIdx.x"): for bhd_i in T.thread_binding(1024, thread="threadIdx.x"): b: T.int32 = (bhd_o * 1024 + bhd_i) // 1280 h: T.int32 = (bhd_o * 1024 + bhd_i) // 64 % 20 d: T.int32 = (bhd_o * 1024 + bhd_i) % 64 if bhd_o * 1024 + bhd_i < batch_size * 20 * 64: for i in range(copy_length_indptr[b + 1] - copy_length_indptr[b]): src_pos: T.int32 = copy_src_dst_pos[0, copy_length_indptr[b] + i] dst_pos: T.int32 = copy_src_dst_pos[1, copy_length_indptr[b] + i] pages[dst_pos // 16, 0, h, dst_pos % 16, d] = pages[src_pos // 16, 0, h, src_pos % 16, d] pages[dst_pos // 16, 1, h, dst_pos % 16, d] = pages[src_pos // 16, 1, h, src_pos % 16, d]
# ---------------------------------------------------------------------------
# concatenate: T_concat (batch, 1, 60, 64) = reshape710 ++ reshape711 ++
# reshape712 stacked along axis 2 (20 rows each -> 60), fp16.  Flattened
# index over 1024-thread blocks with a T.where bounds guard on the tail.
@T.prim_func def concatenate(var_reshape710: T.handle, var_reshape711: T.handle, var_reshape712: T.handle, var_T_concat: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape710 = T.match_buffer(var_reshape710, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") reshape711 = T.match_buffer(var_reshape711, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") reshape712 = T.match_buffer(var_reshape712, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") T_concat = T.match_buffer(var_T_concat, (batch_size, T.int64(1), T.int64(60), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_concat"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) 
T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(3840)) T.reads(reshape712[v0, T.int64(0), v1 + T.int64(-40), v2], reshape711[v0, T.int64(0), v1 + T.int64(-20), v2], reshape710[v0, T.int64(0), v1, v2]) T.writes(T_concat[v0, T.int64(0), v1, v2]) T_concat[v0, T.int64(0), v1, v2] = T.if_then_else(T.int64(40) <= v1, reshape712[v0, T.int64(0), v1 - T.int64(40), v2], T.if_then_else(T.int64(20) <= v1, reshape711[v0, T.int64(0), v1 + T.int64(-20), v2], reshape710[v0, T.int64(0), v1, v2]))
# ---------------------------------------------------------------------------
# concatenate1: same 3-way axis-2 concat as `concatenate`, but for the
# sequence-major layout (1, seq_len, 20, 64) -> (1, seq_len, 60, 64).
@T.prim_func def concatenate1(var_reshape387: T.handle, var_reshape388: T.handle, var_reshape389: T.handle, var_T_concat: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() reshape387 = T.match_buffer(var_reshape387, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") reshape388 = T.match_buffer(var_reshape388, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") reshape389 = T.match_buffer(var_reshape389, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") T_concat = T.match_buffer(var_T_concat, (T.int64(1), seq_len, T.int64(60), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_concat"): v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(3840)) T.reads(reshape389[T.int64(0), v0, v1 + T.int64(-40), v2], reshape388[T.int64(0), v0, v1 + T.int64(-20), v2], reshape387[T.int64(0), v0, v1, v2]) 
T.writes(T_concat[T.int64(0), v0, v1, v2]) T_concat[T.int64(0), v0, v1, v2] = T.if_then_else(T.int64(40) <= v1, reshape389[T.int64(0), v0, v1 - T.int64(40), v2], T.if_then_else(T.int64(20) <= v1, reshape388[T.int64(0), v0, v1 + T.int64(-20), v2], reshape387[T.int64(0), v0, v1, v2]))
# ---------------------------------------------------------------------------
# copy_single_page: copies one KV-cache page (both K and V planes, 20 heads,
# head_dim 64, first copy_length slots) from src_page_id to tgt_page_id.
@T.prim_func def copy_single_page(var_pages: T.handle, src_page_id: T.int64, tgt_page_id: T.int64, copy_length: T.int64): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) num_pages, page_size = T.int32(), T.int64() pages = T.match_buffer(var_pages, (num_pages, 2, 20, page_size, 64), "float16") # with T.block("root"): for b in T.thread_binding((copy_length * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for t in T.thread_binding(1024, thread="threadIdx.x"): with T.block("copy"): vh = T.axis.spatial(20, T.Cast("int32", (b * T.int64(1024) + T.Cast("int64", t)) // (copy_length * T.int64(64)))) vp = T.axis.spatial(copy_length, (b * T.int64(1024) + T.Cast("int64", t)) % (copy_length * T.int64(64)) // T.int64(64)) vd = T.axis.spatial(64, T.Cast("int32", (b * T.int64(1024) + T.Cast("int64", t)) % T.int64(64))) T.reads(pages[src_page_id, 0:2, vh, vp, vd]) T.writes(pages[tgt_page_id, 0:2, vh, vp, vd]) pages[tgt_page_id, 0, vh, vp, vd] = pages[src_page_id, 0, vh, vp, vd] pages[tgt_page_id, 1, vh, vp, vd] = pages[src_page_id, 1, vh, vp, vd]
# ---------------------------------------------------------------------------
# cumsum: thin wrapper that hands the (batch, vocab) probability matrix to
# Thrust ("tvm.contrib.thrust.sum_scan") for a row-wise sum scan, with a
# preallocated byte workspace.  NOTE(review): the T.bool(False) argument
# presumably selects the exclusive/inclusive scan variant -- confirm against
# the TVM thrust binding before relying on it.
@T.prim_func def cumsum(var_sorted_probs: T.handle, var_lv1: T.handle, var_exclusive_scan_thrust: T.handle): T.func_attr({"tir.noalias": T.bool(True)}) batch_size, vocab_size = T.int64(), T.int64() data_buf = T.match_buffer(var_sorted_probs, (batch_size, vocab_size), align=8) workspace_buf = T.match_buffer(var_lv1, 
(T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12),), "uint8", align=8) output_buf = T.match_buffer(var_exclusive_scan_thrust, (batch_size, vocab_size), align=8) with T.block("exclusive_scan_thrust"): T.reads() T.writes() T.call_packed("tvm.contrib.thrust.sum_scan", T.tvm_stack_make_array(data_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.tvm_stack_make_array(output_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.bool(False), T.tvm_stack_make_array(workspace_buf.data, T.tvm_stack_make_shape(T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12)), 0, 1, T.uint8(0), T.int64(0)))
# ---------------------------------------------------------------------------
# full: device-side fill -- result[v0, 0] = value for every batch row of the
# int32 (batch_size, 1) output buffer.
@T.prim_func def full(var_result: T.handle, value: T.int32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) batch_size = T.int32(is_size_var=True) result = T.match_buffer(var_result, (batch_size, 1), "int32") # with T.block("root"): for ax0_fused_0 in T.thread_binding((batch_size + 1023) // 1024, thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): with T.block("block"): v0 = T.axis.spatial(batch_size, ax0_fused_0 * 1024 + ax0_fused_1) T.where(ax0_fused_0 * 1024 + ax0_fused_1 < batch_size) T.reads() T.writes(result[v0, 0]) result[v0, 0] = value
# ---------------------------------------------------------------------------
# fused_NT_matmul1_add8_gelu2: single-token GEMV
#   out[0,0,j] = GELU(dot(layer_norm358, fc1_weight[j,:]) + fc1_bias[j])
# for j in [0, 5120), fp16, using the erf-based GELU
#   x * (0.5 + 0.5 * erf(x / sqrt(2))).
# Scheduled as a two-stage rfactor reduction (256 per-lane partials ->
# 64 threadIdx.x partials -> final cross-thread sum) on 4x64 thread blocks;
# the activation is staged through shared memory, weights through registers.
@T.prim_func def fused_NT_matmul1_add8_gelu2(layer_norm358: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_fc1_weight5: T.Buffer((T.int64(5120), T.int64(1280)), "float16"), model_decoder_layers_0_fc1_bias5: T.Buffer((T.int64(5120),), "float16"), 
T_multiply_intermediate: T.Buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="local") NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(256), T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="local") NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(64), T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="local") model_decoder_layers_0_fc1_weight5_local = T.alloc_buffer((T.int64(5120), T.int64(1280)), "float16", scope="local") layer_norm358_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared") for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(1280), thread="blockIdx.x"): for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_0 in T.serial(T.int64(5), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): for ax2_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"): for ax2_2 in T.thread_binding(T.int64(64), thread="threadIdx.x"): for ax2_3 in T.vectorized(T.int64(1)): with T.block("layer_norm358_shared"): v0, v1 = T.axis.remap("SS", [ax0, ax1]) v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(256) + ax2_1 * T.int64(64) + ax2_2 + ax2_3) T.reads(layer_norm358[v0, v1, v2]) T.writes(layer_norm358_shared[v0, v1, v2]) layer_norm358_shared[v0, v1, v2] = layer_norm358[v0, v1, v2] for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * 
T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) T.reads() T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax0_ax1_fused_0 in range(T.int64(2)): for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): with T.block("model_decoder_layers_0_fc1_weight5_local"): v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1) v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) T.reads(model_decoder_layers_0_fc1_weight5[v0, v1]) T.writes(model_decoder_layers_0_fc1_weight5_local[v0, v1]) model_decoder_layers_0_fc1_weight5_local[v0, v1] = model_decoder_layers_0_fc1_weight5[v0, v1] for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(1)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) vax1_fused_u_fused_2, vax1_fused_u_fused_0 = T.axis.remap("RR", [ax1_fused_u_fused_2, ax1_fused_u_fused_0]) T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], 
layer_norm358_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused], model_decoder_layers_0_fc1_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused]) T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + layer_norm358_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused] * model_decoder_layers_0_fc1_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused] for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax2_fused_2_1 in T.vectorized(T.int64(1)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(64), ax0) v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads() T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1 in range(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, 
vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] for ax1_fused_2 in range(T.int64(1)): for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): with T.block("NT_matmul"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(64), ax0) v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) with T.init(): NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] for 
ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): for ax0_fused_2 in range(T.int64(1)): with T.block("T_multiply_2"): v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) T.reads(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_fc1_bias5[v0]) T.writes(T_multiply_intermediate[T.int64(0), T.int64(0), v0]) T_multiply_intermediate[T.int64(0), T.int64(0), v0] = (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_fc1_bias5[v0]) * (T.float16(0.5) + T.Cast("float16", T.erf(T.Cast("float32", (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_fc1_bias5[v0]) * T.float16(0.70710678118654757)))) * T.float16(0.5))
# ---------------------------------------------------------------------------
# fused_NT_matmul2_add7_add6: single-token GEMV
#   out[0,0,j] = add1227[0,0,j] + dot(gelu130, fc2_weight[j,:]) + fc2_bias[j]
# i.e. the fc2 projection plus bias plus residual add, fp16, j in [0, 1280).
# Same two-stage rfactor schedule as the fc1 kernel, on 16x32 thread blocks
# (128 -> 32 partials), activations staged through shared memory.
@T.prim_func def fused_NT_matmul2_add7_add6(gelu130: T.Buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16"), model_decoder_layers_0_fc2_weight5: T.Buffer((T.int64(1280), T.int64(5120)), "float16"), model_decoder_layers_0_fc2_bias5: T.Buffer((T.int64(1280),), "float16"), add1227: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_add_intermediate_1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") model_decoder_layers_0_fc2_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(5120)), "float16", scope="local") gelu130_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="shared") for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), 
thread="blockIdx.x"): for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_0 in T.serial(T.int64(5), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_3 in T.vectorized(T.int64(2)): with T.block("gelu130_shared"): v0, v1 = T.axis.remap("SS", [ax0, ax1]) v2 = T.axis.spatial(T.int64(5120), ax2_0 * T.int64(1024) + ax2_1 * T.int64(64) + ax2_2 * T.int64(2) + ax2_3) T.reads(gelu130[v0, v1, v2]) T.writes(gelu130_shared[v0, v1, v2]) gelu130_shared[v0, v1, v2] = gelu130[v0, v1, v2] for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) T.reads() T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1_fused_u_fused_0 in T.serial(T.int64(20), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax0_ax1_fused_0 in range(T.int64(4)): for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): with T.block("model_decoder_layers_0_fc2_weight5_local"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1) v1 = 
T.axis.spatial(T.int64(5120), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) T.reads(model_decoder_layers_0_fc2_weight5[v0, v1]) T.writes(model_decoder_layers_0_fc2_weight5_local[v0, v1]) model_decoder_layers_0_fc2_weight5_local[v0, v1] = model_decoder_layers_0_fc2_weight5[v0, v1] for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2]) T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], gelu130_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_fc2_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]) T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + 
gelu130_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_fc2_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax2_fused_2_1 in T.vectorized(T.int64(1)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads() T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1 in range(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) 
T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] for ax1_fused_2 in range(T.int64(1)): for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): with T.block("NT_matmul"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) with T.init(): NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] for ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0_fused_2 in range(T.int64(1)): with T.block("T_add_1"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) T.reads(add1227[T.int64(0), T.int64(0), v0], NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_fc2_bias5[v0]) T.writes(T_add_intermediate_1[T.int64(0), T.int64(0), v0]) T_add_intermediate_1[T.int64(0), T.int64(0), v0] = add1227[T.int64(0), 
T.int64(0), v0] + (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_fc2_bias5[v0])
# ---------------------------------------------------------------------------
# fused_NT_matmul_add7: single-token q_proj GEMV + bias
#   out[0,0,j] = dot(layer_norm356, q_proj_weight[j,:]) + q_proj_bias[j]
# (definition continues past this excerpt).
@T.prim_func def fused_NT_matmul_add7(layer_norm356: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_q_proj_weight5: T.Buffer((T.int64(1280), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_q_proj_bias5: T.Buffer((T.int64(1280),), "float16"), T_add_intermediate: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") model_decoder_layers_0_self_attn_q_proj_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(1280)), "float16", scope="local") layer_norm356_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared") for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), thread="blockIdx.x"): for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_0 in T.serial(T.int64(3), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_3 in T.vectorized(T.int64(1)): with T.block("layer_norm356_shared"): v0, v1 = T.axis.remap("SS", [ax0, ax1]) v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(512) + ax2_1 * T.int64(32) + ax2_2 + ax2_3) T.where((ax2_0 * 
T.int64(16) + ax2_1) * T.int64(32) + ax2_2 + ax2_3 < T.int64(1280)) T.reads(layer_norm356[v0, v1, v2]) T.writes(layer_norm356_shared[v0, v1, v2]) layer_norm356_shared[v0, v1, v2] = layer_norm356[v0, v1, v2] for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) T.reads() T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax0_ax1_fused_0 in range(T.int64(4)): for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): with T.block("model_decoder_layers_0_self_attn_q_proj_weight5_local"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1) v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) T.reads(model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1]) T.writes(model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1]) model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1] = model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1] for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): with 
T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2]) T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]) T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] for 
ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax2_fused_2_1 in T.vectorized(T.int64(1)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads() T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1 in range(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] for 
ax1_fused_2 in range(T.int64(1)): for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): with T.block("NT_matmul"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) with T.init(): NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] for ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0_fused_2 in range(T.int64(1)): with T.block("T_add"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) T.reads(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_self_attn_q_proj_bias5[v0]) T.writes(T_add_intermediate[T.int64(0), T.int64(0), v0]) T_add_intermediate[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_self_attn_q_proj_bias5[v0] @T.prim_func def fused_NT_matmul_add7_add6(reshape1361: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_out_proj_weight5: T.Buffer((T.int64(1280), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_out_proj_bias5: T.Buffer((T.int64(1280),), "float16"), add1220: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_add_intermediate_1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): 
T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") model_decoder_layers_0_self_attn_out_proj_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(1280)), "float16", scope="local") reshape1361_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared") for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), thread="blockIdx.x"): for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_0 in T.serial(T.int64(3), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_3 in T.vectorized(T.int64(1)): with T.block("reshape1361_shared"): v0, v1 = T.axis.remap("SS", [ax0, ax1]) v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(512) + ax2_1 * T.int64(32) + ax2_2 + ax2_3) T.where((ax2_0 * T.int64(16) + ax2_1) * T.int64(32) + ax2_2 + ax2_3 < T.int64(1280)) T.reads(reshape1361[v0, v1, v2]) T.writes(reshape1361_shared[v0, v1, v2]) reshape1361_shared[v0, v1, v2] = reshape1361[v0, v1, v2] for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * 
T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) T.reads() T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax0_ax1_fused_0 in range(T.int64(4)): for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): with T.block("model_decoder_layers_0_self_attn_out_proj_weight5_local"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1) v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) T.reads(model_decoder_layers_0_self_attn_out_proj_weight5[v0, v1]) T.writes(model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, v1]) model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, v1] = model_decoder_layers_0_self_attn_out_proj_weight5[v0, v1] for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)): for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2]) 
T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], reshape1361_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]) T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + reshape1361_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): for ax2_fused_2_1 in T.vectorized(T.int64(1)): with T.block("NT_matmul_rf_init"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), 
u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads() T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) for ax1 in range(T.int64(4)): with T.block("NT_matmul_rf_update"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] for ax1_fused_2 in range(T.int64(1)): for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): with T.block("NT_matmul"): vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0) v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) 
T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) with T.init(): NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] for ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): for ax0_fused_2 in range(T.int64(1)): with T.block("T_add_1"): v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) T.reads(add1220[T.int64(0), T.int64(0), v0], NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_self_attn_out_proj_bias5[v0]) T.writes(T_add_intermediate_1[T.int64(0), T.int64(0), v0]) T_add_intermediate_1[T.int64(0), T.int64(0), v0] = add1220[T.int64(0), T.int64(0), v0] + (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_self_attn_out_proj_bias5[v0]) @T.prim_func def fused_add4_maximum_minimum(p_add4: T.handle, p_lv611: T.handle, p_output0: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() add4 = T.match_buffer(p_add4, (batch_size, T.int64(1500), T.int64(1280)), "float16") lv611 = T.match_buffer(p_lv611, (batch_size, T.int64(1500), T.int64(1280)), "float16") T_minimum_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1500), T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_minimum"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // 
T.int64(1920000)) v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280)) v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280)) T.reads(add4[v0, v1, v2], lv611[v0, v1, v2]) T.writes(T_minimum_intermediate[v0, v1, v2]) T_minimum_intermediate[v0, v1, v2] = T.min(T.max(add4[v0, v1, v2] + lv611[v0, v1, v2], T.float16(-65504)), T.float16(65504)) @T.prim_func def fused_conv1d1_add2_gelu1(p_gelu: T.handle, model_encoder_conv2_weight: T.Buffer((T.int64(1280), T.int64(1280), T.int64(3)), "float16"), lv3: T.Buffer((T.int64(1), T.int64(1280), T.int64(1)), "float16"), p_output0: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() gelu = T.match_buffer(p_gelu, (batch_size, T.int64(1280), T.int64(3000)), "float16") T_multiply_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1280), T.int64(1500)), "float16") # with T.block("root"): conv1d_ncw_intermediate_shared = T.alloc_buffer((batch_size, T.int64(1280), T.int64(1500)), "float16", scope="shared") for ax0_ax1_ax2_fused in T.thread_binding(batch_size * T.int64(1920000), thread="blockIdx.x"): for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(1), T.int64(1)): for ax3_ax4_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax3_ax4_fused_0 in T.serial(T.int64(15), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("conv1d_ncw"): v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(1920000) + ax0) v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % T.int64(1920000) // T.int64(1500) + ax1) v2 = T.axis.spatial(T.int64(1500), ax0_ax1_ax2_fused % T.int64(1500) + ax2) v3 = T.axis.reduce(T.int64(1280), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) // T.int64(3)) v4 = T.axis.reduce(T.int64(3), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) % T.int64(3)) T.reads(gelu[v0, v3, 
v2 * T.int64(2) + v4 - T.int64(1)], model_encoder_conv2_weight[v1, v3, v4]) T.writes(conv1d_ncw_intermediate_shared[v0, v1, v2]) with T.init(): conv1d_ncw_intermediate_shared[v0, v1, v2] = T.float16(0) conv1d_ncw_intermediate_shared[v0, v1, v2] = conv1d_ncw_intermediate_shared[v0, v1, v2] + T.if_then_else(T.int64(1) <= v2 * T.int64(2) + v4 and v2 * T.int64(2) + v4 < T.int64(3001), gelu[v0, v3, v2 * T.int64(2) + v4 - T.int64(1)], T.float16(0)) * model_encoder_conv2_weight[v1, v3, v4] for ax3 in range(T.int64(1)): for ax4_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax4_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("T_multiply_2"): v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(1920000)) v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % T.int64(1920000) // T.int64(1500)) v2 = T.axis.spatial(T.int64(1500), ax0_ax1_ax2_fused % T.int64(1500)) v3 = T.axis.spatial(T.int64(1), ax3) v4 = T.axis.spatial(T.int64(1), ax4_0 * T.int64(256) + ax4_1) T.where(ax4_0 * T.int64(256) + ax4_1 < T.int64(1)) T.reads(conv1d_ncw_intermediate_shared[v0, v1, v2], lv3[T.int64(0), v1, T.int64(0)]) T.writes(T_multiply_intermediate[v0, v1, v2]) T_multiply_intermediate[v0, v1, v2] = (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv3[T.int64(0), v1, T.int64(0)]) * (T.float16(0.5) + T.Cast("float16", T.erf(T.Cast("float32", (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv3[T.int64(0), v1, T.int64(0)]) * T.float16(0.70710678118654757)))) * T.float16(0.5)) @T.prim_func def fused_conv1d_add1_gelu(p_input_features: T.handle, model_encoder_conv1_weight: T.Buffer((T.int64(1280), T.int64(128), T.int64(3)), "float16"), lv1: T.Buffer((T.int64(1), T.int64(1280), T.int64(1)), "float16"), p_output0: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() input_features = T.match_buffer(p_input_features, (batch_size, T.int64(128), T.int64(3000)), "float16") 
T_multiply_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1280), T.int64(3000)), "float16") # with T.block("root"): conv1d_ncw_intermediate_shared = T.alloc_buffer((batch_size, T.int64(1280), T.int64(3000)), "float16", scope="shared") for ax0_ax1_ax2_fused in T.thread_binding(batch_size * T.int64(3840000), thread="blockIdx.x"): for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(1), T.int64(1)): for ax3_ax4_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax3_ax4_fused_0 in T.serial(T.int64(2), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("conv1d_ncw"): v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(3840000) + ax0) v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % T.int64(3840000) // T.int64(3000) + ax1) v2 = T.axis.spatial(T.int64(3000), ax0_ax1_ax2_fused % T.int64(3000) + ax2) v3 = T.axis.reduce(T.int64(128), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) // T.int64(3)) v4 = T.axis.reduce(T.int64(3), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) % T.int64(3)) T.where(ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1 < T.int64(384)) T.reads(input_features[v0, v3, v2 + v4 - T.int64(1)], model_encoder_conv1_weight[v1, v3, v4]) T.writes(conv1d_ncw_intermediate_shared[v0, v1, v2]) with T.init(): conv1d_ncw_intermediate_shared[v0, v1, v2] = T.float16(0) conv1d_ncw_intermediate_shared[v0, v1, v2] = conv1d_ncw_intermediate_shared[v0, v1, v2] + T.if_then_else(T.int64(1) <= v2 + v4 and v2 + v4 < T.int64(3001), input_features[v0, v3, v2 + v4 - T.int64(1)], T.float16(0)) * model_encoder_conv1_weight[v1, v3, v4] for ax3 in range(T.int64(1)): for ax4_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax4_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("T_multiply_2"): v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(3840000)) v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % 
T.int64(3840000) // T.int64(3000)) v2 = T.axis.spatial(T.int64(3000), ax0_ax1_ax2_fused % T.int64(3000)) v3 = T.axis.spatial(T.int64(1), ax3) v4 = T.axis.spatial(T.int64(1), ax4_0 * T.int64(256) + ax4_1) T.where(ax4_0 * T.int64(256) + ax4_1 < T.int64(1)) T.reads(conv1d_ncw_intermediate_shared[v0, v1, v2], lv1[T.int64(0), v1, T.int64(0)]) T.writes(T_multiply_intermediate[v0, v1, v2]) T_multiply_intermediate[v0, v1, v2] = (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv1[T.int64(0), v1, T.int64(0)]) * (T.float16(0.5) + T.Cast("float16", T.erf(T.Cast("float32", (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv1[T.int64(0), v1, T.int64(0)]) * T.float16(0.70710678118654757)))) * T.float16(0.5)) @T.prim_func def fused_reshape20_reshape20_add6(take7: T.Buffer((T.int64(1), T.int64(1280)), "float16"), take8: T.Buffer((T.int64(1), T.int64(1280)), "float16"), T_add_intermediate: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_add"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(take7[T.int64(0), v0], take8[T.int64(0), v0]) T.writes(T_add_intermediate[T.int64(0), T.int64(0), v0]) T_add_intermediate[T.int64(0), T.int64(0), v0] = take7[T.int64(0), v0] + take8[T.int64(0), v0] @T.prim_func def fused_reshape21_reshape21_reshape21_concatenate2_reshape22(add1221: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), lv1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), add1222: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_reshape_intermediate_1_2_3: T.Buffer((T.int64(1), T.int64(60), T.int64(64)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # 
with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding(T.int64(4), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape_3"): v0 = T.axis.spatial(T.int64(60), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(64)) v1 = T.axis.spatial(T.int64(64), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(64)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < T.int64(3840)) T.reads(add1222[T.int64(0), T.int64(0), (v0 - T.int64(40)) * T.int64(64) + v1], lv1[T.int64(0), T.int64(0), (v0 + T.int64(-20)) * T.int64(64) + v1], add1221[T.int64(0), T.int64(0), v0 * T.int64(64) + v1]) T.writes(T_reshape_intermediate_1_2_3[T.int64(0), v0, v1]) T_reshape_intermediate_1_2_3[T.int64(0), v0, v1] = T.if_then_else(T.int64(40) <= v0, add1222[T.int64(0), T.int64(0), (v0 - T.int64(40)) * T.int64(64) + v1], T.if_then_else(T.int64(20) <= v0, lv1[T.int64(0), T.int64(0), (v0 + T.int64(-20)) * T.int64(64) + v1], add1221[T.int64(0), T.int64(0), v0 * T.int64(64) + v1])) @T.prim_func def fused_reshape21_reshape25(add1225: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_reshape_intermediate_1: T.Buffer((T.int64(1), T.int64(20), T.int64(64)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape_1"): v0 = T.axis.spatial(T.int64(20), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(64)) v1 = T.axis.spatial(T.int64(64), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(64)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < T.int64(1280)) T.reads(add1225[T.int64(0), T.int64(0), v0 * T.int64(64) + v1]) T.writes(T_reshape_intermediate_1[T.int64(0), v0, v1]) T_reshape_intermediate_1[T.int64(0), v0, v1] = add1225[T.int64(0), 
T.int64(0), v0 * T.int64(64) + v1] @T.prim_func def fused_reshape23_reshape24(lv265: T.Buffer((T.int64(1), T.int64(20), T.int64(64)), "float16"), T_reshape_intermediate_1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape_1"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(lv265[T.int64(0), v0 // T.int64(64), v0 % T.int64(64)]) T.writes(T_reshape_intermediate_1[T.int64(0), T.int64(0), v0]) T_reshape_intermediate_1[T.int64(0), T.int64(0), v0] = lv265[T.int64(0), v0 // T.int64(64), v0 % T.int64(64)] @T.prim_func def fused_reshape9(packed_params_1: T.Buffer((T.int64(1280),), "float16"), T_reshape_intermediate: T.Buffer((T.int64(1), T.int64(1280), T.int64(1)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(packed_params_1[v0]) T.writes(T_reshape_intermediate[T.int64(0), v0, T.int64(0)]) T_reshape_intermediate[T.int64(0), v0, T.int64(0)] = packed_params_1[v0] @T.prim_func def fused_rope(var_qkv: T.handle, var_position_map: T.handle, var_q: T.handle, var_k: T.handle, var_v: T.handle, apply_rope: T.int32): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], 
"max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() qkv = T.match_buffer(var_qkv, (seq_len, 60, 64), "float16") position_map = T.match_buffer(var_position_map, (seq_len,), "int32", offset_factor=1) q = T.match_buffer(var_q, (seq_len, 20, 64), "float16") k = T.match_buffer(var_k, (seq_len, 20, 64), "float16") v = T.match_buffer(var_v, (seq_len, 20, 64), "float16") # with T.block("root"): for iters_0_iters_1_iters_2_fused_0 in T.thread_binding((seq_len * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for iters_0_iters_1_iters_2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("llama_fused_rope"): s = T.axis.spatial(seq_len, (iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1) // T.int64(3840)) h = T.axis.spatial(60, T.Cast("int32", (iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1) % T.int64(3840) // T.int64(64))) d = T.axis.spatial(64, T.Cast("int32", (iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1) % T.int64(64))) T.where(iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1 < seq_len * T.int64(3840)) T.reads(position_map[s], qkv[s, h, d - 32:d - 32 + 65]) T.writes(q[s, h, d], k[s, h - 20, d], v[s, h - 40, d]) if h < 20: q[s, h, d] = T.if_then_else(apply_rope > 0 and d < 64, T.Cast("float16", T.cos(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", qkv[s, h, d]) + T.sin(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(d < 32, qkv[s, h, d + 32] * T.float16(-1), qkv[s, h, d - 32]))), qkv[s, h, d]) else: if h < 40: k[s, h - 20, d] = T.if_then_else(apply_rope > 0 and d < 64, 
T.Cast("float16", T.cos(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", qkv[s, h, d]) + T.sin(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(d < 32, qkv[s, h, d + 32] * T.float16(-1), qkv[s, h, d - 32]))), qkv[s, h, d]) else: v[s, h - 40, d] = qkv[s, h, d] @T.prim_func def fused_transpose_add3(packed_params_4: T.Buffer((T.int64(1500), T.int64(1280)), "float16"), p_gelu1: T.handle, p_output0: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() gelu1 = T.match_buffer(p_gelu1, (batch_size, T.int64(1280), T.int64(1500)), "float16") T_add_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1500), T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_add"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1920000)) v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280)) v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280)) T.reads(gelu1[v0, v2, v1], packed_params_4[v1, v2]) T.writes(T_add_intermediate[v0, v1, v2]) T_add_intermediate[v0, v1, v2] = gelu1[v0, v2, v1] + packed_params_4[v1, v2] @T.prim_func def gather_probs(var_src: T.handle, var_indices: T.handle, var_dst: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, 
"max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) m, n = T.int32(is_size_var=True), T.int32(is_size_var=True) src = T.match_buffer(var_src, (m, n)) batch_size = T.int32(is_size_var=True) indices = T.match_buffer(var_indices, (batch_size,), "int32") dst = T.match_buffer(var_dst, (batch_size, n)) # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * n + 1023) // 1024, thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): with T.block("gather_2d"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % (n * batch_size) // n) v1 = T.axis.spatial(n, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % n) T.where(ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1 < batch_size * n) T.reads(src[indices[v0], v1], indices[v0]) T.writes(dst[v0, v1]) dst[v0, v1] = src[indices[v0], v1] @T.prim_func def get_index_from_sorted(A: T.handle, B: T.handle, C: T.handle, D: T.handle, E: T.handle, F: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) batch, vocab_size = T.int64(), T.int64() cumsum_sorted = T.match_buffer(A, (batch, vocab_size)) indices = T.match_buffer(B, (batch, vocab_size), "int32") renorm_prob = T.match_buffer(C, (batch, 1)) out_batch = T.int64() usample = T.match_buffer(D, (out_batch, 1)) sample_indices = T.match_buffer(E, (out_batch, 1), "int32") output_index = T.match_buffer(F, (out_batch, 1), "int32") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((out_batch * vocab_size + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_get_index_from_sorted"): v0 = T.axis.spatial(out_batch, (ax0_ax1_fused_0 * 
T.int64(1024) + ax0_ax1_fused_1) % (vocab_size * out_batch) // vocab_size) v1 = T.axis.spatial(vocab_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % vocab_size) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < out_batch * vocab_size) T.reads(usample[v0, T.int64(0)], cumsum_sorted[sample_indices[v0, T.int64(0)], v1 - T.int64(1):v1 - T.int64(1) + T.int64(2)], sample_indices[v0, T.int64(0)], renorm_prob[sample_indices[v0, T.int64(0)], 0], indices[sample_indices[v0, T.int64(0)], T.min(T.int64(0), v1):T.min(T.int64(0), v1) + (v1 + T.int64(1))]) T.writes(output_index[v0, 0]) if usample[v0, T.int64(0)] < cumsum_sorted[sample_indices[v0, T.int64(0)], v1] / renorm_prob[sample_indices[v0, T.int64(0)], 0] or v1 + T.int64(1) == vocab_size: if v1 == T.int64(0): output_index[v0, 0] = indices[sample_indices[v0, T.int64(0)], 0] else: if usample[v0, T.int64(0)] >= cumsum_sorted[sample_indices[v0, T.int64(0)], v1 - T.int64(1)] / renorm_prob[sample_indices[v0, T.int64(0)], 0]: output_index[v0, 0] = indices[sample_indices[v0, T.int64(0)], v1] @T.prim_func def get_renorm_prob(A: T.handle, B: T.handle, C: T.handle, D: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) batch, vocab_size = T.int64(), T.int64() cumsum_sorted = T.match_buffer(A, (batch, vocab_size)) top_p = T.match_buffer(B, (batch, 1)) top_k = T.match_buffer(C, (batch, 1), "int32") renorm_prob = T.match_buffer(D, (batch, 1)) # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch * vocab_size + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_get_renorm_prob"): v0 = T.axis.spatial(batch, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % (vocab_size * batch) // 
vocab_size) v1 = T.axis.spatial(vocab_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % vocab_size) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch * vocab_size) T.reads(cumsum_sorted[v0, T.min(T.min(T.int64(0), v1), v1 + T.int64(1)):T.min(T.min(T.int64(0), v1), v1 + T.int64(1)) + (v1 + T.int64(2))], top_p[v0, 0], top_k[v0, 0]) T.writes(renorm_prob[v0, 0]) if not (cumsum_sorted[v0, 0] < top_p[v0, 0] and top_k[v0, 0] > 1): renorm_prob[v0, 0] = cumsum_sorted[v0, 0] else: if cumsum_sorted[v0, v1] < top_p[v0, 0] and v1 + T.int64(1) < T.Cast("int64", top_k[v0, 0]): if v1 + T.int64(1) == vocab_size: renorm_prob[v0, 0] = cumsum_sorted[v0, v1] else: if not (cumsum_sorted[v0, v1 + T.int64(1)] < top_p[v0, 0] and v1 + T.int64(1) + T.int64(1) < T.Cast("int64", top_k[v0, 0])): renorm_prob[v0, 0] = cumsum_sorted[v0, v1 + T.int64(1)]
    # Slices the last sequence position: index[0, 0, :] = layer_norm355[0, seq_len - 1, :] (hidden width 1280).
    @T.prim_func def index(var_layer_norm355: T.handle, index: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() layer_norm355 = T.match_buffer(var_layer_norm355, (T.int64(1), seq_len, T.int64(1280)), "float16") # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("index"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(layer_norm355[T.int64(0), seq_len - T.int64(1), v0]) T.writes(index[T.int64(0), T.int64(0), v0]) index[T.int64(0), T.int64(0), v0] = layer_norm355[T.int64(0), seq_len - T.int64(1), v0]
    # LayerNorm over the 1280-wide hidden dim of a (batch, 1, 1280) input; mean/variance via shared-memory
    # reduction (0.00078125 == 1/1280, eps 1e-5), then scale/shift with the weight/bias buffers.
    @T.prim_func def layer_norm(var_add578: T.handle,
model_decoder_layers_0_self_attn_layer_norm_weight3: T.Buffer((T.int64(1280),), "float16"), model_decoder_layers_0_self_attn_layer_norm_bias3: T.Buffer((T.int64(1280),), "float16"), var_T_layer_norm: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() add578 = T.match_buffer(var_add578, (batch_size, T.int64(1), T.int64(1280)), "float16") T_layer_norm = T.match_buffer(var_T_layer_norm, (batch_size, T.int64(1), T.int64(1280)), "float16") # with T.block("root"): add578_red_temp_v0_shared = T.alloc_buffer((batch_size, T.int64(1)), scope="shared") add578_red_temp_v1_shared = T.alloc_buffer((batch_size, T.int64(1)), scope="shared") for ax0_fused in T.thread_binding(batch_size, thread="blockIdx.x"): for ax0 in range(T.int64(1)): for ax1_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("add578_red_temp"): v0 = T.axis.spatial(batch_size, ax0_fused + ax0) v1 = T.axis.reduce(T.int64(1280), ax1_fused_0 * T.int64(256) + ax1_fused_1) T.reads(add578[v0, T.int64(0), v1]) T.writes(add578_red_temp_v0_shared[v0, T.int64(0)], add578_red_temp_v1_shared[v0, T.int64(0)]) with T.init(): add578_red_temp_v0_shared[v0, T.int64(0)] = T.float32(0) add578_red_temp_v1_shared[v0, T.int64(0)] = T.float32(0) v_add578_red_temp_v0: T.float32 = add578_red_temp_v0_shared[v0, T.int64(0)] + T.Cast("float32", add578[v0, T.int64(0), v1]) v_add578_red_temp_v1: T.float32 = add578_red_temp_v1_shared[v0, T.int64(0)] + T.Cast("float32", add578[v0, T.int64(0), v1]) * T.Cast("float32", add578[v0, T.int64(0), v1]) add578_red_temp_v0_shared[v0, T.int64(0)] = v_add578_red_temp_v0 add578_red_temp_v1_shared[v0, T.int64(0)] = v_add578_red_temp_v1 for ax1_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}):
with T.block("T_layer_norm"): v0 = T.axis.spatial(batch_size, ax0_fused) v1 = T.axis.spatial(T.int64(1280), ax1_0 * T.int64(256) + ax1_1) T.reads(add578[v0, T.int64(0), v1], add578_red_temp_v0_shared[v0, T.int64(0)], add578_red_temp_v1_shared[v0, T.int64(0)], model_decoder_layers_0_self_attn_layer_norm_weight3[v1], model_decoder_layers_0_self_attn_layer_norm_bias3[v1]) T.writes(T_layer_norm[v0, T.int64(0), v1]) T_layer_norm[v0, T.int64(0), v1] = T.Cast("float16", (T.Cast("float32", add578[v0, T.int64(0), v1]) - add578_red_temp_v0_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004)) * T.rsqrt(add578_red_temp_v1_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004) - add578_red_temp_v0_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004) * (add578_red_temp_v0_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_decoder_layers_0_self_attn_layer_norm_weight3[v1] + model_decoder_layers_0_self_attn_layer_norm_bias3[v1]
    # Same LayerNorm pattern over (batch, 1500, 1280): one block per (batch, position), 256-thread reduction
    # over the 1280 hidden dim (0.00078125 == 1/1280, eps 1e-5). Generated TIR — comments only.
    @T.prim_func def layer_norm1(var_add: T.handle, model_encoder_layers_0_self_attn_layer_norm_weight: T.Buffer((T.int64(1280),), "float16"), model_encoder_layers_0_self_attn_layer_norm_bias: T.Buffer((T.int64(1280),), "float16"), var_T_layer_norm: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() add = T.match_buffer(var_add, (batch_size, T.int64(1500), T.int64(1280)), "float16") T_layer_norm = T.match_buffer(var_T_layer_norm, (batch_size, T.int64(1500), T.int64(1280)), "float16") # with T.block("root"): add_red_temp_v0_shared = T.alloc_buffer((batch_size, T.int64(1500)), scope="shared") add_red_temp_v1_shared = T.alloc_buffer((batch_size, T.int64(1500)), scope="shared") for ax0_ax1_fused in T.thread_binding(batch_size * T.int64(1500), thread="blockIdx.x"): for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): for ax2_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax2_fused_0 in T.serial(T.int64(5),
annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("add_red_temp"): v0 = T.axis.spatial(batch_size, ax0_ax1_fused // T.int64(1500) + ax0) v1 = T.axis.spatial(T.int64(1500), ax0_ax1_fused % T.int64(1500) + ax1) v2 = T.axis.reduce(T.int64(1280), ax2_fused_0 * T.int64(256) + ax2_fused_1) T.reads(add[v0, v1, v2]) T.writes(add_red_temp_v0_shared[v0, v1], add_red_temp_v1_shared[v0, v1]) with T.init(): add_red_temp_v0_shared[v0, v1] = T.float32(0) add_red_temp_v1_shared[v0, v1] = T.float32(0) v_add_red_temp_v0: T.float32 = add_red_temp_v0_shared[v0, v1] + T.Cast("float32", add[v0, v1, v2]) v_add_red_temp_v1: T.float32 = add_red_temp_v1_shared[v0, v1] + T.Cast("float32", add[v0, v1, v2]) * T.Cast("float32", add[v0, v1, v2]) add_red_temp_v0_shared[v0, v1] = v_add_red_temp_v0 add_red_temp_v1_shared[v0, v1] = v_add_red_temp_v1 for ax2_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax2_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("T_layer_norm"): v0 = T.axis.spatial(batch_size, ax0_ax1_fused // T.int64(1500)) v1 = T.axis.spatial(T.int64(1500), ax0_ax1_fused % T.int64(1500)) v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(256) + ax2_1) T.reads(add[v0, v1, v2], add_red_temp_v0_shared[v0, v1], add_red_temp_v1_shared[v0, v1], model_encoder_layers_0_self_attn_layer_norm_weight[v2], model_encoder_layers_0_self_attn_layer_norm_bias[v2]) T.writes(T_layer_norm[v0, v1, v2]) T_layer_norm[v0, v1, v2] = T.Cast("float16", (T.Cast("float32", add[v0, v1, v2]) - add_red_temp_v0_shared[v0, v1] * T.float32(0.00078125000000000004)) * T.rsqrt(add_red_temp_v1_shared[v0, v1] * T.float32(0.00078125000000000004) - add_red_temp_v0_shared[v0, v1] * T.float32(0.00078125000000000004) * (add_red_temp_v0_shared[v0, v1] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_encoder_layers_0_self_attn_layer_norm_weight[v2] +
model_encoder_layers_0_self_attn_layer_norm_bias[v2]
    # LayerNorm variant over (1, seq_len, 1280): one block per sequence position, same mean/variance
    # shared-memory reduction (0.00078125 == 1/1280, eps 1e-5). Generated TIR — comments only.
    @T.prim_func def layer_norm2(var_add257: T.handle, model_decoder_layers_0_self_attn_layer_norm_weight2: T.Buffer((T.int64(1280),), "float16"), model_decoder_layers_0_self_attn_layer_norm_bias2: T.Buffer((T.int64(1280),), "float16"), var_T_layer_norm: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() add257 = T.match_buffer(var_add257, (T.int64(1), seq_len, T.int64(1280)), "float16") T_layer_norm = T.match_buffer(var_T_layer_norm, (T.int64(1), seq_len, T.int64(1280)), "float16") # with T.block("root"): add257_red_temp_v0_shared = T.alloc_buffer((T.int64(1), seq_len), scope="shared") add257_red_temp_v1_shared = T.alloc_buffer((T.int64(1), seq_len), scope="shared") for ax0_fused in T.thread_binding(seq_len, thread="blockIdx.x"): for ax0 in range(T.int64(1)): for ax1_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("add257_red_temp"): v0 = T.axis.spatial(seq_len, ax0_fused + ax0) v1 = T.axis.reduce(T.int64(1280), ax1_fused_0 * T.int64(256) + ax1_fused_1) T.reads(add257[T.int64(0), v0, v1]) T.writes(add257_red_temp_v0_shared[T.int64(0), v0], add257_red_temp_v1_shared[T.int64(0), v0]) with T.init(): add257_red_temp_v0_shared[T.int64(0), v0] = T.float32(0) add257_red_temp_v1_shared[T.int64(0), v0] = T.float32(0) v_add257_red_temp_v0: T.float32 = add257_red_temp_v0_shared[T.int64(0), v0] + T.Cast("float32", add257[T.int64(0), v0, v1]) v_add257_red_temp_v1: T.float32 = add257_red_temp_v1_shared[T.int64(0), v0] + T.Cast("float32", add257[T.int64(0), v0, v1]) * T.Cast("float32", add257[T.int64(0), v0, v1]) add257_red_temp_v0_shared[T.int64(0), v0] = v_add257_red_temp_v0 add257_red_temp_v1_shared[T.int64(0), v0] = v_add257_red_temp_v1 for ax1_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_0 in
T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("T_layer_norm"): v0 = T.axis.spatial(seq_len, ax0_fused) v1 = T.axis.spatial(T.int64(1280), ax1_0 * T.int64(256) + ax1_1) T.reads(add257[T.int64(0), v0, v1], add257_red_temp_v0_shared[T.int64(0), v0], add257_red_temp_v1_shared[T.int64(0), v0], model_decoder_layers_0_self_attn_layer_norm_weight2[v1], model_decoder_layers_0_self_attn_layer_norm_bias2[v1]) T.writes(T_layer_norm[T.int64(0), v0, v1]) T_layer_norm[T.int64(0), v0, v1] = T.Cast("float16", (T.Cast("float32", add257[T.int64(0), v0, v1]) - add257_red_temp_v0_shared[T.int64(0), v0] * T.float32(0.00078125000000000004)) * T.rsqrt(add257_red_temp_v1_shared[T.int64(0), v0] * T.float32(0.00078125000000000004) - add257_red_temp_v0_shared[T.int64(0), v0] * T.float32(0.00078125000000000004) * (add257_red_temp_v0_shared[T.int64(0), v0] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_decoder_layers_0_self_attn_layer_norm_weight2[v1] + model_decoder_layers_0_self_attn_layer_norm_bias2[v1]
    # LayerNorm variant for a single-token (1, 1, 1280) input: one block, 256-thread reduction over the hidden dim.
    @T.prim_func def layer_norm3(add1220: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_layer_norm_weight5: T.Buffer((T.int64(1280),), "float16"), model_decoder_layers_0_self_attn_layer_norm_bias5: T.Buffer((T.int64(1280),), "float16"), T_layer_norm: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): add1220_red_temp_v0_shared = T.alloc_buffer((T.int64(1), T.int64(1)), scope="shared") add1220_red_temp_v1_shared = T.alloc_buffer((T.int64(1), T.int64(1)), scope="shared") for ax0_fused in T.thread_binding(T.int64(1), thread="blockIdx.x"): for ax0 in range(T.int64(1)): for ax1_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256,
"pragma_unroll_explicit": 1}): with T.block("add1220_red_temp"): v0 = T.axis.spatial(T.int64(1), ax0) v1 = T.axis.reduce(T.int64(1280), ax1_fused_0 * T.int64(256) + ax1_fused_1) T.reads(add1220[T.int64(0), T.int64(0), v1]) T.writes(add1220_red_temp_v0_shared[T.int64(0), T.int64(0)], add1220_red_temp_v1_shared[T.int64(0), T.int64(0)]) with T.init(): add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] = T.float32(0) add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] = T.float32(0) v_add1220_red_temp_v0: T.float32 = add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] + T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) v_add1220_red_temp_v1: T.float32 = add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] + T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) * T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] = v_add1220_red_temp_v0 add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] = v_add1220_red_temp_v1 for ax1_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): for ax1_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): with T.block("T_layer_norm"): v0 = T.axis.spatial(T.int64(1), T.int64(0)) v1 = T.axis.spatial(T.int64(1280), ax1_0 * T.int64(256) + ax1_1) T.reads(add1220[T.int64(0), T.int64(0), v1], add1220_red_temp_v0_shared[T.int64(0), T.int64(0)], add1220_red_temp_v1_shared[T.int64(0), T.int64(0)], model_decoder_layers_0_self_attn_layer_norm_weight5[v1], model_decoder_layers_0_self_attn_layer_norm_bias5[v1]) T.writes(T_layer_norm[T.int64(0), T.int64(0), v1]) T_layer_norm[T.int64(0), T.int64(0), v1] = T.Cast("float16", (T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) - add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] * T.float32(0.00078125000000000004)) * T.rsqrt(add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] * T.float32(0.00078125000000000004) - add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] * 
T.float32(0.00078125000000000004) * (add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_decoder_layers_0_self_attn_layer_norm_weight5[v1] + model_decoder_layers_0_self_attn_layer_norm_bias5[v1] @T.prim_func def merge_state_inplace(v: T.handle, s: T.handle, v_other: T.handle, s_other: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) N, H, D = T.int32(is_size_var=True), T.int32(is_size_var=True), T.int32(is_size_var=True) V = T.match_buffer(v, (N, H, D), "float16") S = T.match_buffer(s, (N, H)) V_other = T.match_buffer(v_other, (N, H, D), "float16") S_other = T.match_buffer(s_other, (N, H)) # with T.block("root"): for bx in T.thread_binding(N, thread="blockIdx.x"): for by in T.thread_binding(1, thread="blockIdx.y"): for ty in T.thread_binding(20, thread="threadIdx.y"): for tx in T.thread_binding(16, thread="threadIdx.x"): with T.block("merge"): T.reads(S[bx, ty + by * 20], S_other[bx, ty + by * 20], V[bx, ty + by * 20, tx * 4:tx * 4 + 4], V_other[bx, ty + by * 20, tx * 4:tx * 4 + 4]) T.writes(V[bx, ty + by * 20, tx * 4:tx * 4 + 4], S[bx, ty + by * 20]) s_val = T.alloc_buffer((1,), scope="local") s_other_val = T.alloc_buffer((1,), scope="local") s_max = T.alloc_buffer((1,), scope="local") scale = T.alloc_buffer((1,), scope="local") other_scale = T.alloc_buffer((1,), scope="local") v_vec = T.alloc_buffer((4,), "float16", scope="local") v_other_vec = T.alloc_buffer((4,), "float16", scope="local") s_val[0] = S[bx, ty + by * 20] s_other_val[0] = S_other[bx, ty + by * 20] s_max[0] = T.max(s_val[0], s_other_val[0]) s_val[0] = T.exp2(s_val[0] - s_max[0]) 
s_other_val[0] = T.exp2(s_other_val[0] - s_max[0]) scale[0] = s_val[0] / (s_val[0] + s_other_val[0]) other_scale[0] = s_other_val[0] / (s_val[0] + s_other_val[0]) for vec in T.vectorized(4): v_vec[vec] = V[bx, ty + by * 20, tx * 4 + vec] for vec in T.vectorized(4): v_other_vec[vec] = V_other[bx, ty + by * 20, tx * 4 + vec] for vec in range(4): v_vec[vec] = T.Cast("float16", T.Cast("float32", v_vec[vec]) * scale[0] + T.Cast("float32", v_other_vec[vec]) * other_scale[0]) for vec in T.vectorized(4): V[bx, ty + by * 20, tx * 4 + vec] = v_vec[vec] S[bx, ty + by * 20] = T.log2(s_val[0] + s_other_val[0]) + s_max[0]
    # One-block-per-sample multinomial sampling: scans each probability row in 512-wide chunks (4x32 threads x 4
    # elements), builds a shared-memory inclusive cumsum per chunk, and emits the first index whose running total
    # reaches the uniform sample u (with a 1e-6 tolerance). Generated TIR — comments only.
    @T.prim_func def parallel_sampling_from_prob(var_prob: T.handle, var_uniform_samples: T.handle, var_row_indices: T.handle, var_sampled_token_ids: T.handle): T.func_attr({"tir.is_scheduled": 1}) n, vocab_size = T.int64(), T.int64() prob = T.match_buffer(var_prob, (n, vocab_size)) batch_size = T.int64() uniform_samples = T.match_buffer(var_uniform_samples, (batch_size, 1)) row_indices = T.match_buffer(var_row_indices, (batch_size, 1), "int32") token_ids = T.match_buffer(var_sampled_token_ids, (batch_size, 1), "int32") # with T.block("root"): aggregate = T.alloc_buffer((), scope="local") sample_id_local = T.alloc_buffer((), "int32", scope="local") step_iter = T.alloc_buffer((), "int32", scope="local") for bx in T.thread_binding(batch_size, thread="blockIdx.x"): row_idx: T.int32 = row_indices[bx, 0] for ty in T.thread_binding(T.int64(4), thread="threadIdx.y"): for tx in T.thread_binding(T.int64(32), thread="threadIdx.x"): u: T.float32 = uniform_samples[bx, 0] aggregate[()] = T.Cast("float32", 0) step_iter[()] = 0 while T.tvm_thread_invariant((step_iter[()] == 0 or aggregate[()] < u - T.float32(9.9999999999999995e-07)) and T.Cast("int64", step_iter[()]) < (vocab_size + T.int64(512) - T.int64(1)) // T.int64(512)): with T.block(""): T.reads(step_iter[()], prob[row_idx, T.Cast("int64", step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4):T.Cast("int64",
step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4) + T.int64(4)], aggregate[()]) T.writes(sample_id_local[()], aggregate[()]) prob_gt_threshold = T.alloc_buffer((T.int64(4),), scope="local") cumsum = T.alloc_buffer((T.int64(512),), scope="shared") greater_than_u = T.alloc_buffer((T.int64(4),), "bool", scope="local") mask = T.alloc_buffer((T.int64(4),), "bool", scope="local") valid = T.alloc_buffer((T.int64(4),), "bool", scope="local") indices = T.alloc_buffer((T.int64(4),), "int32", scope="local") step_aggregate = T.alloc_buffer((), scope="local") for v in T.unroll(T.int64(4)): idx: T.int64 = T.Cast("int64", step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4) + v prob_local: T.float32 = T.if_then_else(idx < vocab_size, prob[row_idx, idx], T.Cast("float32", 0)) prob_gt_threshold[v] = T.if_then_else(prob_local > T.float32(0), prob_local, T.Cast("float32", 0)) valid[v] = prob_local > T.float32(0) and idx < vocab_size with T.block(""): T.reads(prob_gt_threshold[T.int64(0):T.int64(4)]) T.writes(step_aggregate[()]) local_sum = T.alloc_buffer((), scope="local") shared_buf = T.alloc_buffer((T.int64(128),), scope="shared") idx: T.int64 = ty * T.int64(32) + tx local_sum[()] = T.Cast("float32", 0) for i in T.unroll(T.int64(4)): local_sum[()] = local_sum[()] + prob_gt_threshold[i] shared_buf[idx] = local_sum[()] for i in T.unroll(T.int64(7)): if idx % T.shift_left(T.int64(1), i + T.int64(1)) == T.int64(0): shared_buf[idx] = shared_buf[idx] + shared_buf[idx + T.shift_left(T.int64(1), i)] step_aggregate[()] = shared_buf[0] if T.tvm_thread_invariant(aggregate[()] + step_aggregate[()] >= u - T.float32(9.9999999999999995e-07)): for i in T.unroll(T.int64(1), T.int64(4)): prob_gt_threshold[i] = prob_gt_threshold[i] + prob_gt_threshold[i - T.int64(1)] for i in T.vectorized(T.int64(4)): cumsum[ty * T.int64(128) + tx * T.int64(4) + i] = prob_gt_threshold[i] for i in T.unroll(T.int64(5)): for j in T.vectorized(T.int64(4)): idx: T.int64 = ty * T.int64(128)
+ tx * T.int64(4) if tx >= T.shift_left(T.int64(1), i): cumsum[idx + j] = cumsum[idx + j] + cumsum[idx - T.shift_left(T.int64(1), i) * T.int64(4) + T.int64(4) - T.int64(1)] for i in T.unroll(T.int64(1), T.int64(4)): for j in T.vectorized(T.int64(4)): if ty == T.int64(0): idx: T.int64 = i * T.int64(128) + tx * T.int64(4) cumsum[idx + j] = cumsum[idx + j] + cumsum[i * T.int64(128) - T.int64(1)] for v in T.unroll(T.int64(4)): greater_than_u[v] = cumsum[ty * T.int64(128) + tx * T.int64(4) + v] + aggregate[()] >= u - T.float32(9.9999999999999995e-07) with T.block(""): T.reads(greater_than_u[T.int64(0):T.int64(4)]) T.writes(mask[T.int64(0):T.int64(4)]) shared_buf = T.alloc_buffer((T.int64(128),), "bool", scope="shared") tx_idx: T.int64 = ty * T.int64(32) + tx shared_buf[tx_idx] = greater_than_u[T.int64(3)] mask[0] = T.if_then_else(tx_idx != T.int64(0), T.Cast("int8", greater_than_u[0]) != T.Cast("int8", shared_buf[tx_idx - T.int64(1)]), greater_than_u[0]) for i in T.unroll(T.int64(1), T.int64(4)): mask[i] = T.Cast("int8", greater_than_u[i]) != T.Cast("int8", greater_than_u[i - T.int64(1)]) for v in T.unroll(T.int64(4)): mask[v] = mask[v] and valid[v] indices[v] = T.Cast("int32", T.Cast("int64", step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4) + v) with T.block(""): T.reads(mask[T.int64(0):T.int64(4)], indices[T.int64(0):T.int64(4)]) T.writes(sample_id_local[()]) local_sum = T.alloc_buffer((), "int32", scope="local") shared_buf = T.alloc_buffer((T.int64(128),), "int32", scope="shared") idx: T.int64 = ty * T.int64(32) + tx local_sum[()] = T.Cast("int32", vocab_size - T.int64(1)) for i in T.unroll(T.int64(4)): if mask[i]: local_sum[()] = T.min(local_sum[()], indices[i]) shared_buf[idx] = local_sum[()] for i in T.unroll(T.int64(7)): if idx % T.shift_left(T.int64(1), i + T.int64(1)) == T.int64(0): shared_buf[idx] = T.min(shared_buf[idx], shared_buf[idx + T.shift_left(T.int64(1), i)]) sample_id_local[()] = shared_buf[0] aggregate[()] = aggregate[()] +
step_aggregate[()] step_iter[()] = step_iter[()] + 1 if tx == T.int64(0) and ty == T.int64(0): token_ids[bx, 0] = sample_id_local[()]
    # Reshape (b, 1500, 1280) -> (b, 1500, 20, 64): split the hidden dim into 20 heads x 64 dims.
    @T.prim_func def reshape(var_lv: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() lv = T.match_buffer(var_lv, (batch_size, T.int64(1500), T.int64(1280)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_ax3_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) // T.int64(1920000)) v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1920000) // T.int64(1280)) v2 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1280) // T.int64(64)) v3 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(64)) T.reads(lv[v0, v1, v2 * T.int64(64) + v3]) T.writes(T_reshape[v0, v1, v2, v3]) T_reshape[v0, v1, v2, v3] = lv[v0, v1, v2 * T.int64(64) + v3]
    # Reshape (b, 1500, 20, 64) -> (b*1500, 20, 64): fold batch and sequence into one axis.
    @T.prim_func def reshape1(var_reshape256: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape256 = T.match_buffer(var_reshape256, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size * T.int64(1500), T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 =
T.axis.spatial(batch_size * T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.reads(reshape256[v0 // T.int64(1500), v0 % T.int64(1500), v1, v2]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = reshape256[v0 // T.int64(1500), v0 % T.int64(1500), v1, v2]
    # Reshape (b*1500, 20, 64) -> (b, 1500, 20, 64): inverse of reshape1.
    @T.prim_func def reshape10(var_lv4: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() lv4 = T.match_buffer(var_lv4, (batch_size * T.int64(1500), T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_ax3_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) // T.int64(1920000)) v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1920000) // T.int64(1280)) v2 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1280) // T.int64(64)) v3 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(64)) T.reads(lv4[v0 * T.int64(1500) + v1, v2, v3]) T.writes(T_reshape[v0, v1, v2, v3]) T_reshape[v0, v1, v2, v3] = lv4[v0 * T.int64(1500) + v1, v2, v3]
    # Reshape (b, 1500, 20, 64) -> (b, 1500, 1280): merge heads back into the hidden dim.
    @T.prim_func def reshape11(var_reshape6: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape6 =
T.match_buffer(var_reshape6, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1500), T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1920000)) v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280)) v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280)) T.reads(reshape6[v0, v1, v2 // T.int64(64), v2 % T.int64(64)]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = reshape6[v0, v1, v2 // T.int64(64), v2 % T.int64(64)]
    # Reshape (1, seq_len) int32 -> (seq_len,): drop the leading unit batch dim of the token-id tensor.
    @T.prim_func def reshape12(var_input_ids: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() input_ids = T.match_buffer(var_input_ids, (T.int64(1), seq_len), "int32") T_reshape = T.match_buffer(var_T_reshape, (seq_len,), "int32") # with T.block("root"): for ax0_fused_0 in T.thread_binding((seq_len + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < seq_len) T.reads(input_ids[T.int64(0), v0]) T.writes(T_reshape[v0]) T_reshape[v0] = input_ids[T.int64(0), v0]
    # Reshape (seq_len, 1280) -> (1, seq_len, 1280): add a leading unit batch dim.
    @T.prim_func def reshape13(var_take: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() take = T.match_buffer(var_take, (seq_len, T.int64(1280)), "float16") T_reshape = T.match_buffer(var_T_reshape, (T.int64(1),
seq_len, T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < seq_len * T.int64(1280)) T.reads(take[v0, v1]) T.writes(T_reshape[T.int64(0), v0, v1]) T_reshape[T.int64(0), v0, v1] = take[v0, v1]
    # Reshape (1, seq_len, 1280) -> (1, seq_len, 20, 64): split hidden dim into heads.
    @T.prim_func def reshape14(var_lv416: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() lv416 = T.match_buffer(var_lv416, (T.int64(1), seq_len, T.int64(1280)), "float16") T_reshape = T.match_buffer(var_T_reshape, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(1280)) T.reads(lv416[T.int64(0), v0, v1 * T.int64(64) + v2]) T.writes(T_reshape[T.int64(0), v0, v1, v2]) T_reshape[T.int64(0), v0, v1, v2] = lv416[T.int64(0), v0, v1 * T.int64(64) + v2]
    # Reshape (1, seq_len, 60, 64) -> (seq_len, 60, 64): drop the unit batch dim of the fused QKV tensor.
    @T.prim_func def reshape15(var_concat: T.handle, var_T_reshape: T.handle):
T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() concat = T.match_buffer(var_concat, (T.int64(1), seq_len, T.int64(60), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (seq_len, T.int64(60), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(3840)) T.reads(concat[T.int64(0), v0, v1, v2]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = concat[T.int64(0), v0, v1, v2] @T.prim_func def reshape16(var_lv69: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() lv69 = T.match_buffer(var_lv69, (seq_len, T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + 
ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(1280)) T.reads(lv69[v0, v1, v2]) T.writes(T_reshape[T.int64(0), v0, v1, v2]) T_reshape[T.int64(0), v0, v1, v2] = lv69[v0, v1, v2] @T.prim_func def reshape17(var_reshape391: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() reshape391 = T.match_buffer(var_reshape391, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (T.int64(1), seq_len, T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(seq_len, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < seq_len * T.int64(1280)) T.reads(reshape391[T.int64(0), v0, v1 // T.int64(64), v1 % T.int64(64)]) T.writes(T_reshape[T.int64(0), v0, v1]) T_reshape[T.int64(0), v0, v1] = reshape391[T.int64(0), v0, v1 // T.int64(64), v1 % T.int64(64)] @T.prim_func def reshape18(var_reshape393: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() reshape393 = T.match_buffer(var_reshape393, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (seq_len, T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = 
T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(1280)) T.reads(reshape393[T.int64(0), v0, v1, v2]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = reshape393[T.int64(0), v0, v1, v2] @T.prim_func def reshape19(input_ids: T.Buffer((T.int64(1), T.int64(1)), "int32"), T_reshape: T.Buffer((T.int64(1),), "int32")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(1), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(T.int64(1), T.int64(0)) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1)) T.reads(input_ids[T.int64(0), T.int64(0)]) T.writes(T_reshape[T.int64(0)]) T_reshape[T.int64(0)] = input_ids[T.int64(0), T.int64(0)] @T.prim_func def reshape2(var_input_ids: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() input_ids = T.match_buffer(var_input_ids, (batch_size, T.int64(1)), "int32") T_reshape = T.match_buffer(var_T_reshape, (batch_size,), "int32") # with T.block("root"): for ax0_fused_0 in T.thread_binding((batch_size + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < batch_size) T.reads(input_ids[v0, T.int64(0)]) T.writes(T_reshape[v0]) T_reshape[v0] = input_ids[v0, T.int64(0)] @T.prim_func def 
reshape3(var_take3: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() take3 = T.match_buffer(var_take3, (batch_size, T.int64(1280)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) T.reads(take3[v0, v1]) T.writes(T_reshape[v0, T.int64(0), v1]) T_reshape[v0, T.int64(0), v1] = take3[v0, v1] @T.prim_func def reshape4(var_lv224: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() lv224 = T.match_buffer(var_lv224, (batch_size, T.int64(1), T.int64(1280)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < 
batch_size * T.int64(1280)) T.reads(lv224[v0, T.int64(0), v1 * T.int64(64) + v2]) T.writes(T_reshape[v0, T.int64(0), v1, v2]) T_reshape[v0, T.int64(0), v1, v2] = lv224[v0, T.int64(0), v1 * T.int64(64) + v2] @T.prim_func def reshape5(var_concat32: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() concat32 = T.match_buffer(var_concat32, (batch_size, T.int64(1), T.int64(60), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(60), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(3840)) T.reads(concat32[v0, T.int64(0), v1, v2]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = concat32[v0, T.int64(0), v1, v2] @T.prim_func def reshape6(var_lv134: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() lv134 = T.match_buffer(var_lv134, (batch_size, T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with 
T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(1280)) T.reads(lv134[v0, v1, v2]) T.writes(T_reshape[v0, T.int64(0), v1, v2]) T_reshape[v0, T.int64(0), v1, v2] = lv134[v0, v1, v2] @T.prim_func def reshape7(var_reshape714: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape714 = T.match_buffer(var_reshape714, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) T.reads(reshape714[v0, T.int64(0), v1 // T.int64(64), v1 % T.int64(64)]) T.writes(T_reshape[v0, T.int64(0), v1]) T_reshape[v0, T.int64(0), v1] = reshape714[v0, T.int64(0), v1 // T.int64(64), v1 % T.int64(64)] @T.prim_func def reshape8(var_reshape716: T.handle, var_T_reshape: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape716 = T.match_buffer(var_reshape716, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") T_reshape = 
T.match_buffer(var_T_reshape, (batch_size, T.int64(20), T.int64(64)), "float16") # with T.block("root"): for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_reshape"): v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(1280)) T.reads(reshape716[v0, T.int64(0), v1, v2]) T.writes(T_reshape[v0, v1, v2]) T_reshape[v0, v1, v2] = reshape716[v0, T.int64(0), v1, v2] @T.prim_func def sampler_take_probs_tir(var_unsorted_probs: T.handle, var_sorted_indices: T.handle, var_sample_indices: T.handle, var_sampling_results: T.handle, var_top_prob_offsets: T.handle, var_sampled_values: T.handle, var_top_prob_probs: T.handle, var_top_prob_indices: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True) unsorted_probs = T.match_buffer(var_unsorted_probs, (batch_size, vocab_size)) sorted_indices = T.match_buffer(var_sorted_indices, (batch_size, vocab_size), "int32") num_samples = T.int32(is_size_var=True) sample_indices = T.match_buffer(var_sample_indices, (num_samples,), "int32") sampling_results = T.match_buffer(var_sampling_results, 
(num_samples,), "int32") num_positions = T.int32(is_size_var=True) top_prob_offsets = T.match_buffer(var_top_prob_offsets, (num_positions,), "int32") sampled_values = T.match_buffer(var_sampled_values, (num_samples,)) top_prob_probs = T.match_buffer(var_top_prob_probs, (num_positions,)) top_prob_indices = T.match_buffer(var_top_prob_indices, (num_positions,), "int32") # with T.block("root"): for ax0_fused_0 in T.thread_binding((num_positions + num_samples + 1023) // 1024, thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): with T.block("block"): v0 = T.axis.spatial(num_positions + num_samples, ax0_fused_0 * 1024 + ax0_fused_1) T.where(ax0_fused_0 * 1024 + ax0_fused_1 < num_positions + num_samples) T.reads(top_prob_offsets[v0], sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], unsorted_probs[T.min(top_prob_offsets[v0] // vocab_size, sample_indices[v0 + (0 - num_positions)]):T.min(top_prob_offsets[v0] // vocab_size, sample_indices[v0 + (0 - num_positions)]) + (T.max(top_prob_offsets[v0] // vocab_size, sample_indices[v0 - num_positions]) + 1 - T.min(top_prob_offsets[v0] // vocab_size, sample_indices[v0 - num_positions])), T.min(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 + (0 - num_positions)]):T.min(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 + (0 - num_positions)]) + (T.max(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 - num_positions]) + 1 - T.min(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 - num_positions]))], sample_indices[v0 + (0 - num_positions)], sampling_results[v0 + (0 - num_positions)]) T.writes(top_prob_indices[v0], top_prob_probs[v0], sampled_values[v0 + (0 - num_positions)]) if v0 < num_positions: row: T.int32 = top_prob_offsets[v0] // 
vocab_size col: T.int32 = top_prob_offsets[v0] % vocab_size top_prob_indices[v0] = sorted_indices[row, col] top_prob_probs[v0] = unsorted_probs[row, sorted_indices[row, col]] else: vj: T.int32 = v0 - num_positions sampled_values[vj] = unsorted_probs[sample_indices[vj], sampling_results[vj]] @T.prim_func def scatter_probs(var_src: T.handle, var_indices: T.handle, var_dst: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size, n = T.int32(is_size_var=True), T.int32(is_size_var=True) src = T.match_buffer(var_src, (batch_size, n)) indices = T.match_buffer(var_indices, (batch_size,), "int32") m = T.int32(is_size_var=True) dst = T.match_buffer(var_dst, (m, n)) # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * n + 1023) // 1024, thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): with T.block("scatter_2d"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % (n * batch_size) // n) v1 = T.axis.spatial(n, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % n) T.where(ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1 < batch_size * n) T.reads(src[v0, v1], indices[v0]) T.writes(dst[indices[v0], v1]) dst[indices[v0], v1] = src[v0, v1] @T.prim_func def shape_func(H: T.Buffer((T.int64(2),), "int64")): T.func_attr({"tir.is_host_func": 1}) H[T.int64(1)] = H[T.int64(0)] * T.int64(1500) @T.prim_func def shape_func1(H: T.Buffer((T.int64(3),), "int64")): T.func_attr({"tir.is_host_func": 1}) H[T.int64(1)] = H[T.int64(0)] * T.int64(1500) @T.prim_func def shape_func2(H: T.Buffer((T.int64(5),), "int64")): T.func_attr({"tir.is_host_func": 1}) 
H[T.int64(4)] = T.int64(8) * H[T.int64(1)] * T.int64(4) H[T.int64(3)] = T.int64(8) * (H[T.int64(0)] * H[T.int64(1)] * T.int64(4)) + T.int64(8388608) + H[T.int64(0)] * H[T.int64(1)] * T.int64(12) H[T.int64(2)] = T.int64(8) * H[T.int64(1)] * T.int64(4) * T.int64(8) + T.int64(8388608) + T.int64(8) * H[T.int64(1)] * T.int64(12) @T.prim_func def shape_func3(H: T.Buffer((T.int64(6),), "int64")): T.func_attr({"tir.is_host_func": 1}) H[T.int64(4)] = T.int64(8) * (H[T.int64(0)] * H[T.int64(1)] * T.int64(4)) + T.int64(8388608) + H[T.int64(0)] * H[T.int64(1)] * T.int64(12) H[T.int64(3)] = T.int64(8) * H[T.int64(1)] * T.int64(4) * T.int64(8) + T.int64(8388608) + T.int64(8) * H[T.int64(1)] * T.int64(12) H[T.int64(5)] = T.int64(32) * H[T.int64(1)] @T.prim_func def shape_func4(H: T.Buffer((T.int64(3),), "int64")): T.func_attr({"tir.is_host_func": 1}) H[T.int64(2)] = T.int64(8) * H[T.int64(1)] * T.int64(4) @T.prim_func def shape_func5(H: T.Buffer((T.int64(5),), "int64")): T.func_attr({"tir.is_host_func": 1}) H[T.int64(2)] = T.int64(32) * ((H[T.int64(1)] + T.int64(4096) - T.int64(1)) // T.int64(4096)) H[T.int64(4)] = T.int64(32) * H[T.int64(1)] H[T.int64(3)] = (H[T.int64(1)] + T.int64(4096) - T.int64(1)) // T.int64(4096) @T.prim_func def softmax_with_chunked_sum(var_A: T.handle, var_temperature: T.handle, var_chunked_sum: T.handle, var_chunked_max: T.handle, var_softmax: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size, vocab_size = T.int64(is_size_var=True), T.int64(is_size_var=True) A = T.match_buffer(var_A, (batch_size, vocab_size)) temperature = T.match_buffer(var_temperature, (batch_size,)) num_chunks = 
T.int64(is_size_var=True) chunked_sum = T.match_buffer(var_chunked_sum, (batch_size, num_chunks)) chunked_max = T.match_buffer(var_chunked_max, (batch_size, num_chunks)) softmax = T.match_buffer(var_softmax, (batch_size, vocab_size)) # with T.block("root"): temp_max_shared = T.alloc_buffer((batch_size,), scope="shared") temp_sum_shared = T.alloc_buffer((batch_size,), scope="shared") for l0_l1_fused in T.thread_binding(batch_size * num_chunks, thread="blockIdx.x"): for ax0_1 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax0_0 in T.serial((num_chunks + T.int64(31)) // T.int64(32), annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}): with T.block("max"): v0 = T.axis.spatial(batch_size, l0_l1_fused % (num_chunks * batch_size) // num_chunks) v1 = T.axis.reduce(num_chunks, ax0_0 * T.int64(32) + ax0_1) T.where(ax0_0 * T.int64(32) + ax0_1 < num_chunks) T.reads(chunked_max[v0, v1]) T.writes(temp_max_shared[v0]) with T.init(): temp_max_shared[v0] = T.float32(-3.4028234663852886e+38) temp_max_shared[v0] = T.max(temp_max_shared[v0], chunked_max[v0, v1]) for ax0_1 in T.thread_binding(T.int64(32), thread="threadIdx.x"): for ax0_0 in T.serial((num_chunks + T.int64(31)) // T.int64(32), annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}): with T.block("sum_exp"): v0 = T.axis.spatial(batch_size, l0_l1_fused % (num_chunks * batch_size) // num_chunks) v1 = T.axis.reduce(num_chunks, ax0_0 * T.int64(32) + ax0_1) T.where(ax0_0 * T.int64(32) + ax0_1 < num_chunks) T.reads(temperature[v0], chunked_sum[v0, v1], chunked_max[v0, v1], temp_max_shared[v0]) T.writes(temp_sum_shared[v0]) with T.init(): temp_sum_shared[v0] = T.float32(0) temp_sum_shared[v0] = temp_sum_shared[v0] + T.Select(temperature[v0] > T.float32(1.0000000000000001e-05), T.exp(chunked_sum[v0, v1] + chunked_max[v0, v1] - temp_max_shared[v0]), T.Cast("float32", chunked_max[v0, v1] == temp_max_shared[v0]) * chunked_sum[v0, v1]) for l2_0 in T.serial(T.int64(4), 
annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}): for l2_1 in T.thread_binding(T.int64(32), thread="threadIdx.y"): for l2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): with T.block("log_pad"): v0 = T.axis.spatial(batch_size, l0_l1_fused % (num_chunks * batch_size) // num_chunks) v1 = T.axis.spatial(num_chunks, l0_l1_fused % num_chunks) v2 = T.axis.spatial(T.int64(4096), l2_0 * T.int64(1024) + l2_1 * T.int64(32) + l2_2) T.reads(temperature[v0], A[v0, v1 * T.int64(4096) + v2], temp_sum_shared[v0], temp_max_shared[v0]) T.writes(softmax[v0, v1 * T.int64(4096) + v2]) if v1 * T.int64(4096) + v2 < vocab_size: softmax[v0, v1 * T.int64(4096) + v2] = T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), T.exp(A[v0, v1 * T.int64(4096) + v2] / temperature[v0] - (T.log(temp_sum_shared[v0]) + temp_max_shared[v0])), T.Cast("float32", A[v0, v1 * T.int64(4096) + v2] == temp_max_shared[v0]) / temp_sum_shared[v0]) @T.prim_func def take(model_decoder_embed_tokens_weight3: T.Buffer((T.int64(51866), T.int64(1280)), "float16"), var_reshape707: T.handle, var_T_take: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() reshape707 = T.match_buffer(var_reshape707, (batch_size,), "int32") T_take = T.match_buffer(var_T_take, (batch_size, T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_take"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) T.reads(model_decoder_embed_tokens_weight3[reshape707[v0], v1], reshape707[v0]) 
T.writes(T_take[v0, v1]) T_take[v0, v1] = model_decoder_embed_tokens_weight3[reshape707[v0], v1] @T.prim_func def take1(model_decoder_embed_positions_weight3: T.Buffer((T.int64(448), T.int64(1280)), "float16"), var_lv133: T.handle, var_T_take: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size = T.int64() lv133 = T.match_buffer(var_lv133, (batch_size,), "int32") T_take = T.match_buffer(var_T_take, (batch_size, T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_take"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) T.reads(model_decoder_embed_positions_weight3[lv133[v0], v1], lv133[v0]) T.writes(T_take[v0, v1]) T_take[v0, v1] = model_decoder_embed_positions_weight3[lv133[v0], v1] @T.prim_func def take2(var_layer_norm161: T.handle, var_logit_positions: T.handle, var_T_take: T.handle): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) seq_len = T.int64() layer_norm161 = T.match_buffer(var_layer_norm161, (T.int64(1), seq_len, T.int64(1280)), "float16") batch_size = T.int64() logit_positions = T.match_buffer(var_logit_positions, (batch_size,), "int32") T_take = T.match_buffer(var_T_take, (T.int64(1), batch_size, T.int64(1280)), "float16") # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_take"): v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + 
ax0_ax1_fused_1) // T.int64(1280)) v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) T.reads(layer_norm161[T.int64(0), logit_positions[v0], v1], logit_positions[v0]) T.writes(T_take[T.int64(0), v0, v1]) T_take[T.int64(0), v0, v1] = layer_norm161[T.int64(0), logit_positions[v0], v1] @T.prim_func def take3(model_decoder_embed_tokens_weight5: T.Buffer((T.int64(51866), T.int64(1280)), "float16"), reshape1353: T.Buffer((T.int64(1),), "int32"), T_take: T.Buffer((T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_take"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(model_decoder_embed_tokens_weight5[reshape1353[T.int64(0)], v0], reshape1353[T.int64(0)]) T.writes(T_take[T.int64(0), v0]) T_take[T.int64(0), v0] = model_decoder_embed_tokens_weight5[reshape1353[T.int64(0)], v0] @T.prim_func def take4(model_decoder_embed_positions_weight5: T.Buffer((T.int64(448), T.int64(1280)), "float16"), lv264: T.Buffer((T.int64(1),), "int32"), T_take: T.Buffer((T.int64(1), T.int64(1280)), "float16")): T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) # with T.block("root"): for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("T_take"): v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) T.reads(model_decoder_embed_positions_weight5[lv264[T.int64(0)], v0], lv264[T.int64(0)]) 
T.writes(T_take[T.int64(0), v0]) T_take[T.int64(0), v0] = model_decoder_embed_positions_weight5[lv264[T.int64(0)], v0] @T.prim_func def take_sorted_probs(var_probs: T.handle, var_lv1: T.handle, var_take_sorted_probs: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) batch_size, vocab_size = T.int64(), T.int64() probs = T.match_buffer(var_probs, (batch_size, vocab_size)) lv1 = T.match_buffer(var_lv1, (batch_size, vocab_size), "int32") batch_size_1, vocab_size_1 = T.int64(), T.int64() take_sorted_probs = T.match_buffer(var_take_sorted_probs, (batch_size_1, vocab_size_1)) # with T.block("root"): for ax0_ax1_fused_0 in T.thread_binding((batch_size_1 * vocab_size_1 + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("take_sorted_probs"): v0 = T.axis.spatial(batch_size_1, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % (vocab_size_1 * batch_size_1) // vocab_size_1) v1 = T.axis.spatial(vocab_size_1, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % vocab_size_1) T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size_1 * vocab_size_1) T.reads(probs[v0, lv1[v0, v1]], lv1[v0, v1]) T.writes(take_sorted_probs[v0, v1]) take_sorted_probs[v0, v1] = probs[v0, lv1[v0, v1]] @T.prim_func def tir_kv_cache_debug_get_kv(var_pages: T.handle, var_position_map: T.handle, var_k_data: T.handle, var_v_data: T.handle, layer_id: T.int64): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, 
"max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) num_pages, page_size = T.int64(), T.int64(is_size_var=True) pages = T.match_buffer(var_pages, (num_pages, 2, 20, page_size, 64), "float16") seqlen = T.int64(is_size_var=True) position_map = T.match_buffer(var_position_map, (seqlen,), "int32", offset_factor=1) k_data = T.match_buffer(var_k_data, (32, seqlen, 20, 64), "float16") v_data = T.match_buffer(var_v_data, (32, seqlen, 20, 64), "float16") # with T.block("root"): for p_h_d_fused_0 in T.thread_binding((seqlen * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for p_h_d_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): with T.block("copy0"): vp = T.axis.spatial(seqlen, (p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1) // T.int64(1280)) vh = T.axis.spatial(20, T.Cast("int32", (p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1) % T.int64(1280) // T.int64(64))) vd = T.axis.spatial(64, T.Cast("int32", (p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1) % T.int64(64))) T.where(p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1 < seqlen * T.int64(1280)) T.reads(position_map[vp], pages[T.Cast("int64", position_map[vp]) // page_size, 0:2, vh, T.Cast("int64", position_map[vp]) % page_size, vd]) T.writes(k_data[layer_id, vp, vh, vd], v_data[layer_id, vp, vh, vd]) position: T.int32 = position_map[vp] k_data[layer_id, vp, vh, vd] = pages[T.Cast("int64", position) // page_size, 0, vh, T.Cast("int64", position) % page_size, vd] v_data[layer_id, vp, vh, vd] = pages[T.Cast("int64", position) // page_size, 1, vh, T.Cast("int64", position) % page_size, vd] @T.prim_func def tir_kv_cache_transpose_append(var_pages: T.handle, var_k_data: T.handle, var_v_data: T.handle, var_position_map: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", 
"libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) num_pages = T.int64() pages = T.match_buffer(var_pages, (num_pages, 2, 20, 16, 64), "float16") ntoken = T.int64(is_size_var=True) k_data = T.match_buffer(var_k_data, (ntoken, 20, 64), "float16") v_data = T.match_buffer(var_v_data, (ntoken, 20, 64), "float16") position_map = T.match_buffer(var_position_map, (ntoken,), "int32", offset_factor=1) # with T.block("root"): for global_pos_h_f_fused_0 in T.thread_binding((ntoken * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): for global_pos_h_f_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): if position_map[(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) // T.int64(1280)] != -1: with T.block("k_transpose_append"): vgpos = T.axis.spatial(ntoken, (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) // T.int64(1280)) vh = T.axis.spatial(20, T.Cast("int32", (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(1280) // T.int64(64))) vf = T.axis.spatial(64, T.Cast("int32", (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(64))) T.where(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1 < ntoken * T.int64(1280)) T.reads(position_map[vgpos], k_data[vgpos, vh, vf]) T.writes(pages[position_map[vgpos] // 16, 0, vh, position_map[vgpos] % 16, vf]) position: T.int32 = position_map[vgpos] pages[position // 16, 0, vh, position % 16, vf] = k_data[vgpos, vh, vf] with T.block("v_transpose_append"): vgpos = T.axis.spatial(ntoken, (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) // T.int64(1280)) vh = T.axis.spatial(20, T.Cast("int32", (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(1280) // T.int64(64))) vf = T.axis.spatial(64, T.Cast("int32", 
(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(64))) T.where(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1 < ntoken * T.int64(1280)) T.reads(position_map[vgpos], v_data[vgpos, vh, vf]) T.writes(pages[position_map[vgpos] // 16, 1, vh, position_map[vgpos] % 16, vf]) position: T.int32 = position_map[vgpos] pages[position // 16, 1, vh, position % 16, vf] = v_data[vgpos, vh, vf] @T.prim_func def top_p_pivot_cutoff(var_prob: T.handle, var_top_p_arr: T.handle, var_init_pivots: T.handle, var_final_pivot: T.handle, var_final_lsum: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) B, N = T.int32(), T.int32() prob = T.match_buffer(var_prob, (B, N)) top_p_arr = T.match_buffer(var_top_p_arr, (B,)) init_pivots = T.match_buffer(var_init_pivots, (B, 3)) final_pivot = T.match_buffer(var_final_pivot, (B,)) final_lsum = T.match_buffer(var_final_lsum, (B,)) # with T.block("root"): pivot = T.alloc_buffer((3,), scope="local") top_p = T.alloc_buffer((1,), scope="local") L = T.alloc_buffer((1,), scope="shared") R_1 = T.alloc_buffer((1,), scope="shared") L_local = T.alloc_buffer((1,), scope="local") R_local = T.alloc_buffer((1,), scope="local") q = T.alloc_buffer((1,), scope="local") lsum = T.alloc_buffer((3,), scope="local") lmin_broadcast = T.alloc_buffer((1,), scope="shared") lmin_broadcast_local = T.alloc_buffer((1,), scope="local") lmin = T.alloc_buffer((3,), scope="local") cmin = T.alloc_buffer((3,), "int32", scope="local") total_sum = T.alloc_buffer((1,), scope="local") it = T.alloc_buffer((1,), "int32", scope="local") es_local = T.alloc_buffer((1,), "bool", scope="local") es = T.alloc_buffer((1,), "bool", scope="shared") find_pivot_local = T.alloc_buffer((1,), "bool", scope="local") 
find_pivot = T.alloc_buffer((1,), "bool", scope="shared") total_sum_reduce = T.alloc_buffer((1,), scope="local") lsum_reduce = T.alloc_buffer((1,), scope="local") lmin_reduce = T.alloc_buffer((1,), scope="local") cmin_reduce = T.alloc_buffer((1,), "int32", scope="local") for _bx in T.thread_binding(B, thread="blockIdx.x"): for _tx in T.thread_binding(1024, thread="threadIdx.x"): with T.block("CTA"): b, tx = T.axis.remap("SS", [_bx, _tx]) T.reads(top_p_arr[b], top_p[0], L[0], R_1[0], init_pivots[b, 0:3], L_local[0], R_local[0], find_pivot_local[0], it[0], es_local[0], prob[b, it[0] * 1024 + tx], total_sum[0], q[0], pivot[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], lsum[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], lmin[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], cmin[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], total_sum_reduce[0], es[0], lmin_reduce[0], lmin_broadcast[0], lmin_broadcast_local[0], lsum_reduce[0], cmin_reduce[0], find_pivot[0]) T.writes(top_p[0], L[0], R_1[0], find_pivot[0], L_local[0], R_local[0], pivot[0:3], find_pivot_local[0], final_lsum[b], final_pivot[b], lsum[0:3], lmin[0:3], cmin[0:3], total_sum[0], it[0], es_local[0], q[0], total_sum_reduce[0], es[0], lsum_reduce[0], lmin_reduce[0], lmin_broadcast[0], lmin_broadcast_local[0], cmin_reduce[0]) top_p[0] = top_p_arr[b] if tx == 0: L[0] = T.float32(1) - top_p[0] R_1[0] = T.float32(9.9999999999999995e-08) find_pivot[0] = T.bool(False) T.tvm_storage_sync("shared") L_local[0] = L[0] R_local[0] = R_1[0] for i in T.unroll(3): pivot[i] = init_pivots[b, i] find_pivot_local[0] = T.bool(False) if L_local[0] - R_local[0] <= T.float32(9.9999999999999995e-08): if tx == 0: final_lsum[b] = T.float32(1) final_pivot[b] = T.float32(0) find_pivot_local[0] = T.bool(True) while T.tvm_thread_invariant(L_local[0] - R_local[0] > T.float32(9.9999999999999995e-08) and not find_pivot_local[0]): 
T.tvm_storage_sync("shared") for pidx in T.unroll(3): lsum[pidx] = T.float32(0) lmin[pidx] = T.float32(3.4028234663852886e+38) cmin[pidx] = 0 total_sum[0] = T.float32(0) it[0] = 0 es_local[0] = T.bool(False) while it[0] < (N + 1024 - 1) // 1024 and not es_local[0]: q[0] = T.if_then_else(it[0] * 1024 + tx < N, prob[b, it[0] * 1024 + tx], T.float32(0)) total_sum[0] = total_sum[0] + q[0] for pidx in T.unroll(3): if q[0] >= pivot[pidx]: lsum[pidx] = lsum[pidx] + q[0] if lmin[pidx] > q[0]: lmin[pidx] = q[0] cmin[pidx] = 1 else: if lmin[pidx] == q[0]: cmin[pidx] = cmin[pidx] + 1 it[0] = it[0] + 1 if it[0] % 32 == 0: with T.block("block_cross_thread"): T.reads(total_sum[0]) T.writes(total_sum_reduce[0]) T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), total_sum[0], T.bool(True), total_sum_reduce[0], tx) if tx == 0: es[0] = T.float32(1) - total_sum_reduce[0] < pivot[2] T.tvm_storage_sync("shared") es_local[0] = es[0] T.tvm_storage_sync("shared") for pidx in range(3): with T.block("block_cross_thread"): T.reads(lsum[pidx]) T.writes(lsum_reduce[0]) T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), lsum[pidx], T.bool(True), lsum_reduce[0], tx) with T.block("block_cross_thread"): T.reads(lmin[pidx]) T.writes(lmin_reduce[0]) T.attr(T.comm_reducer(lambda x0, y0: T.min(x0, y0), [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), lmin[pidx], T.bool(True), lmin_reduce[0], tx) if tx == 0: lmin_broadcast[0] = lmin_reduce[0] T.tvm_storage_sync("shared") lmin_broadcast_local[0] = lmin_broadcast[0] if lmin[pidx] > lmin_broadcast_local[0]: cmin[pidx] = 0 if tx == 0: lsum[pidx] = lsum_reduce[0] lmin[pidx] = lmin_reduce[0] with T.block("block_cross_thread"): T.reads(cmin[pidx]) T.writes(cmin_reduce[0]) T.attr(T.comm_reducer(lambda 
x0, y0: x0 + y0, [0]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) T.tvm_thread_allreduce(T.uint32(1), cmin[pidx], T.bool(True), cmin_reduce[0], tx) if tx == 0: cmin[pidx] = cmin_reduce[0] T.tvm_storage_sync("shared") if tx == 0: it[0] = 0 while it[0] < 3 and not find_pivot_local[0]: if lsum[it[0]] >= top_p[0] and top_p[0] > lsum[it[0]] - T.Cast("float32", cmin[it[0]]) * lmin[it[0]]: find_pivot[0] = T.bool(True) find_pivot_local[0] = T.bool(True) final_pivot[b] = pivot[it[0]] final_lsum[b] = lsum[it[0]] else: if lsum[it[0]] - lmin[it[0]] * T.Cast("float32", cmin[it[0]]) >= top_p[0]: R_1[0] = pivot[it[0]] final_lsum[b] = lsum[it[0]] else: if lsum[it[0]] < top_p[0]: L[0] = pivot[it[0]] it[0] = it[0] + 1 T.tvm_storage_sync("shared") L_local[0] = L[0] R_local[0] = R_1[0] find_pivot_local[0] = find_pivot[0] for pidx in T.unroll(3): pivot[pidx] = L[0] - T.Cast("float32", pidx + 1) * (L_local[0] - R_local[0]) / T.float32(4) if tx == 0: if not find_pivot_local[0]: final_pivot[b] = R_local[0] if R_local[0] == T.float32(9.9999999999999995e-08): final_lsum[b] = lsum[2] @T.prim_func def top_p_renorm_after_cutoff(var_prob: T.handle, var_final_pivot: T.handle, var_final_lsum: T.handle, var_renorm_prob: T.handle): T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) B, N = T.int32(), T.int32() prob = T.match_buffer(var_prob, (B, N)) final_pivot = T.match_buffer(var_final_pivot, (B,)) final_lsum = T.match_buffer(var_final_lsum, (B,)) renorm_prob = T.match_buffer(var_renorm_prob, (B, N)) # with T.block("root"): pivot = T.alloc_buffer((1,), scope="local") lsum = T.alloc_buffer((1,), scope="local") for _by in T.thread_binding(B, thread="blockIdx.y"): for _bx in T.thread_binding((B + 511) // B, thread="blockIdx.x"): for _tx in 
T.thread_binding(1024, thread="threadIdx.x"): with T.block("CTA"): by, bx, tx = T.axis.remap("SSS", [_by, _bx, _tx]) T.reads(final_pivot[by], final_lsum[by], prob[by, T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx:T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx + (T.Select(0 <= (B + 511) // B, (N - 1) // ((B + 511) // B * 1024) * ((B + 511) // B), 0 - (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + 1)], pivot[0], lsum[0]) T.writes(pivot[0], lsum[0], renorm_prob[by, T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx:T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx + (T.Select(0 <= (B + 511) // B, (N - 1) // ((B + 511) // B * 1024) * ((B + 511) // B), 0 - (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + 1)]) pivot[0] = final_pivot[by] lsum[0] = final_lsum[by] for i in range(((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024)): if i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx < N: renorm_prob[by, i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx] = T.if_then_else(prob[by, i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx] >= pivot[0], prob[by, i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx] / lsum[0], T.float32(0)) @R.function def _metadata() -> R.Object: shape_heap: R.Object = R.null_value() return R.str("{\"model_type\": \"whisper\", \"quantization\": \"q0f16\", \"context_window_size\": 1500, \"sliding_window_size\": -1, \"attention_sink_size\": -1, \"prefill_chunk_size\": 15000, \"tensor_parallel_shards\": 1, \"kv_state_kind\": \"kv_cache\", \"max_batch_size\": 8, \"params\": [{\"name\": 
\"model.encoder.conv1.weight\", \"shape\": [1280, 128, 3], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.conv1.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.conv2.weight\", \"shape\": [1280, 1280, 3], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.conv2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.embed_positions.weight\", \"shape\": [1500, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc2.weight\", \"shape\": [1280, 5120], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.1.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": 
[]}, {\"name\": \"model.encoder.layers.2.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.4.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.5.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.6.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.7.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.8.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.9.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.10.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.11.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.12.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.13.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.14.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.15.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.16.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.17.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.18.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.20.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.21.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": 
[]}, {\"name\": \"model.encoder.layers.22.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": 
[]}, {\"name\": \"model.encoder.layers.23.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn_layer_norm.weight\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn_layer_norm.bias\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc1.weight\", \"shape\": [5120, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc1.bias\", \"shape\": [5120], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.embed_tokens.weight\", \"shape\": [51866, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.embed_positions.weight\", \"shape\": [448, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.0.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.1.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.2.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.2.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.3.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.4.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.4.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.5.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.6.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.7.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.8.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc1.bias\", \"shape\": 
[5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.9.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.10.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.10.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.11.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.12.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.13.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.14.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.15.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.16.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.final_layer_norm.weight\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.out_proj.weight\", \"shape\": [1280, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.19.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.q_proj.weight\", \"shape\": [1280, 
1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.q_proj.bias\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.21.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.v_proj.bias\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.q_proj.weight\", 
\"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.v_proj.bias\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.27.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.27.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.28.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.29.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.final_layer_norm.weight\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}], \"kv_cache\": {\"num_hidden_layers\": 32, \"num_attention_heads\": 20, \"num_key_value_heads\": 20, \"head_dim\": 64}, \"memory_usage\": {\"argsort_probs\": 0, \"batch_compute_cross_attn_kv\": 61440000, \"batch_decode\": 1987392, \"batch_encode\": 276480000, \"batch_prefill\": 616080192, \"create_tir_paged_kv_cache\": 0, \"decode\": 243304, \"multinomial_from_uniform\": 32, \"prefill\": 614610024, \"renormalize_by_top_p\": 64, \"sample_with_top_p\": 64, \"sampler_take_probs\": 416, \"sampler_verify_draft_tokens\": 
0, \"softmax_with_temperature\": 0}}") @R.function def argsort_probs(probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32")) -> R.Tuple(R.Tensor(("batch_size", "vocab_size"), dtype="float32"), R.Tensor(("batch_size", "vocab_size"), dtype="int32")): batch_size = T.int64() vocab_size = T.int64() R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(5),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=argsort_probs, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=argsort_probs, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) cls.shape_func2(shape_heap) gv2560: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) storage30: R.Object = R.vm.alloc_storage(gv2560, R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2561: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),)) lv: R.Tensor(dtype="uint8", ndim=1) = R.vm.alloc_tensor(storage30, R.prim_value(0), gv2561, R.dtype("uint8")) R.vm.kill_object(storage30) gv2562: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),)) storage31: R.Object = R.vm.alloc_storage(gv2562, 
R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2563: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) alloc1976: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage31, R.prim_value(0), gv2563, R.dtype("int32")) R.vm.kill_object(storage31) cls.argsort_thrust(probs, lv, alloc1976) R.vm.kill_object(lv) gv2564: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),)) storage32: R.Object = R.vm.alloc_storage(gv2564, R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2565: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) alloc1977: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage32, R.prim_value(0), gv2565, R.dtype("float32")) R.vm.kill_object(storage32) cls.take_sorted_probs(probs, alloc1976, alloc1977) gv1: R.Tuple(R.Tensor(dtype="float32", ndim=2), R.Tensor(dtype="int32", ndim=2)) = alloc1977, alloc1976 R.vm.kill_object(alloc1976) R.vm.kill_object(alloc1977) gv2566: R.Tensor(dtype="float32", ndim=2) = gv1[0] R.call_packed("vm.builtin.match_shape", gv2566, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=argsort_probs, loc=return, annotation=R.Tuple(R.Tensor((batch_size, vocab_size), dtype=\"float32\"), R.Tensor((batch_size, vocab_size), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) gv2567: R.Tensor(dtype="int32", ndim=2) = gv1[1] R.call_packed("vm.builtin.match_shape", gv2567, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=argsort_probs, loc=return, annotation=R.Tuple(R.Tensor((batch_size, vocab_size), dtype=\"float32\"), 
R.Tensor((batch_size, vocab_size), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) return gv1 @R.function def batch_compute_cross_attn_kv(encoder_hidden_states: R.Tensor(("batch_size", 1500, 1280), dtype="float16"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), 
R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), 
R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), 
R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), 
R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), 
R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Object: batch_size = T.int64() R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(2),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", encoder_hidden_states, R.prim_value(3), R.dtype("float16"), R.str("ErrorContext(fn=batch_compute_cross_attn_kv, loc=param[0], param=encoder_hidden_states, annotation=R.Tensor((batch_size, 1500, 1280), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_compute_cross_attn_kv, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), 
R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", encoder_hidden_states, shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(1280), R.str("ErrorContext(fn=batch_compute_cross_attn_kv, loc=param[0], param=encoder_hidden_states, annotation=R.Tensor((batch_size, 1500, 1280), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) cls.shape_func(shape_heap) model_decoder_layers_0_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[498] storage11: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv883: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc554: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv883, R.dtype("float16")) _552: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_0_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc554) R.vm.kill_object(model_decoder_layers_0_encoder_attn_k_proj_weight1) gv884: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape256: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc554, gv884, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc554) model_decoder_layers_0_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[499] model_decoder_layers_0_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[500] storage12: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv885: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc555: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv885, R.dtype("float16")) _553: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_0_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_0_encoder_attn_v_proj_bias1, alloc555) R.vm.kill_object(model_decoder_layers_0_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_0_encoder_attn_v_proj_bias1) gv886: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape257: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc555, gv886, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc555) gv887: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape258: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape256, gv887, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape256) gv888: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape259: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape257, gv888, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape257) lv36: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", paged_kv_cache, R.prim_value(0), reshape258, reshape259, sinfo_args=(R.Object,)) R.vm.kill_object(reshape258) R.vm.kill_object(reshape259) model_decoder_layers_1_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[522] gv889: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc556: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv889, R.dtype("float16")) _554: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_1_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc556) R.vm.kill_object(model_decoder_layers_1_encoder_attn_k_proj_weight1) gv890: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape260: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc556, gv890, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc556) model_decoder_layers_1_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[523] model_decoder_layers_1_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[524] gv891: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc557: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv891, R.dtype("float16")) _555: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_1_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_1_encoder_attn_v_proj_bias1, alloc557) R.vm.kill_object(model_decoder_layers_1_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_1_encoder_attn_v_proj_bias1) gv892: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape261: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc557, gv892, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc557) gv893: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape262: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape260, gv893, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape260) gv894: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape263: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape261, gv894, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape261) lv37: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv36, R.prim_value(1), reshape262, reshape263, sinfo_args=(R.Object,)) R.vm.kill_object(reshape262) R.vm.kill_object(reshape263) R.vm.kill_object(lv36) 
model_decoder_layers_2_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[546] gv895: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc558: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv895, R.dtype("float16")) _556: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_2_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc558) R.vm.kill_object(model_decoder_layers_2_encoder_attn_k_proj_weight1) gv896: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape264: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc558, gv896, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc558) model_decoder_layers_2_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[547] model_decoder_layers_2_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[548] gv897: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc559: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv897, R.dtype("float16")) _557: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_2_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_2_encoder_attn_v_proj_bias1, alloc559) 
R.vm.kill_object(model_decoder_layers_2_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_2_encoder_attn_v_proj_bias1) gv898: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape265: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc559, gv898, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc559) gv899: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape266: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape264, gv899, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape264) gv900: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape267: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape265, gv900, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape265) lv38: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv37, R.prim_value(2), reshape266, reshape267, sinfo_args=(R.Object,)) R.vm.kill_object(reshape266) R.vm.kill_object(reshape267) R.vm.kill_object(lv37) model_decoder_layers_3_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[570] gv901: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc560: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv901, R.dtype("float16")) _558: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_3_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc560) R.vm.kill_object(model_decoder_layers_3_encoder_attn_k_proj_weight1) gv902: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape268: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc560, gv902, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc560) model_decoder_layers_3_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[571] model_decoder_layers_3_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[572] gv903: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc561: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv903, R.dtype("float16")) _559: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_3_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_3_encoder_attn_v_proj_bias1, alloc561) R.vm.kill_object(model_decoder_layers_3_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_3_encoder_attn_v_proj_bias1) gv904: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape269: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc561, gv904, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc561) gv905: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape270: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape268, gv905, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape268) gv906: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape271: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape269, gv906, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape269) lv39: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv38, R.prim_value(3), reshape270, reshape271, sinfo_args=(R.Object,)) R.vm.kill_object(reshape270) R.vm.kill_object(reshape271) R.vm.kill_object(lv38) model_decoder_layers_4_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[594] gv907: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc562: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv907, R.dtype("float16")) _560: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_4_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc562) R.vm.kill_object(model_decoder_layers_4_encoder_attn_k_proj_weight1) gv908: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape272: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc562, gv908, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc562) model_decoder_layers_4_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[595] model_decoder_layers_4_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[596] gv909: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc563: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv909, R.dtype("float16")) _561: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_4_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_4_encoder_attn_v_proj_bias1, alloc563) R.vm.kill_object(model_decoder_layers_4_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_4_encoder_attn_v_proj_bias1) gv910: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape273: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc563, gv910, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc563) gv911: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape274: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape272, gv911, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape272) gv912: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape275: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape273, gv912, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape273) lv40: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv39, R.prim_value(4), reshape274, reshape275, sinfo_args=(R.Object,)) R.vm.kill_object(reshape274) R.vm.kill_object(reshape275) R.vm.kill_object(lv39) model_decoder_layers_5_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[618] gv913: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc564: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv913, R.dtype("float16")) _562: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_5_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc564) R.vm.kill_object(model_decoder_layers_5_encoder_attn_k_proj_weight1) gv914: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape276: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc564, gv914, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc564) model_decoder_layers_5_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[619] model_decoder_layers_5_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[620] gv915: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc565: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv915, R.dtype("float16")) _563: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_5_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_5_encoder_attn_v_proj_bias1, alloc565) R.vm.kill_object(model_decoder_layers_5_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_5_encoder_attn_v_proj_bias1) gv916: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape277: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc565, gv916, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc565) gv917: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape278: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape276, gv917, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape276) gv918: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape279: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape277, gv918, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape277) lv41: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv40, R.prim_value(5), reshape278, reshape279, sinfo_args=(R.Object,)) R.vm.kill_object(reshape278) R.vm.kill_object(reshape279) R.vm.kill_object(lv40) model_decoder_layers_6_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[642] gv919: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc566: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv919, R.dtype("float16")) _564: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_6_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc566) R.vm.kill_object(model_decoder_layers_6_encoder_attn_k_proj_weight1) gv920: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape280: 
R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc566, gv920, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc566) model_decoder_layers_6_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[643] model_decoder_layers_6_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[644] gv921: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc567: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv921, R.dtype("float16")) _565: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_6_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_6_encoder_attn_v_proj_bias1, alloc567) R.vm.kill_object(model_decoder_layers_6_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_6_encoder_attn_v_proj_bias1) gv922: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape281: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc567, gv922, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc567) gv923: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape282: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape280, gv923, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape280) gv924: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape283: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape281, gv924, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape281) lv42: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv41, R.prim_value(6), reshape282, reshape283, sinfo_args=(R.Object,)) R.vm.kill_object(reshape282) R.vm.kill_object(reshape283) R.vm.kill_object(lv41) model_decoder_layers_7_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[666] gv925: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc568: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv925, R.dtype("float16")) _566: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_7_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc568) R.vm.kill_object(model_decoder_layers_7_encoder_attn_k_proj_weight1) gv926: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape284: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc568, gv926, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc568) 
model_decoder_layers_7_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[667] model_decoder_layers_7_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[668] gv927: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc569: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv927, R.dtype("float16")) _567: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_7_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_7_encoder_attn_v_proj_bias1, alloc569) R.vm.kill_object(model_decoder_layers_7_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_7_encoder_attn_v_proj_bias1) gv928: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape285: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc569, gv928, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc569) gv929: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape286: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape284, gv929, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape284) gv930: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape287: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape285, gv930, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape285) lv43: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv42, R.prim_value(7), reshape286, reshape287, sinfo_args=(R.Object,)) R.vm.kill_object(reshape286) R.vm.kill_object(reshape287) R.vm.kill_object(lv42) model_decoder_layers_8_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[690] gv931: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc570: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv931, R.dtype("float16")) _568: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_8_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc570) R.vm.kill_object(model_decoder_layers_8_encoder_attn_k_proj_weight1) gv932: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape288: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc570, gv932, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc570) model_decoder_layers_8_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[691] model_decoder_layers_8_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[692] gv933: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc571: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv933, R.dtype("float16")) _569: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_8_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_8_encoder_attn_v_proj_bias1, alloc571) R.vm.kill_object(model_decoder_layers_8_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_8_encoder_attn_v_proj_bias1) gv934: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape289: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc571, gv934, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc571) gv935: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape290: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape288, gv935, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape288) gv936: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape291: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape289, gv936, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape289) lv44: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv43, R.prim_value(8), reshape290, reshape291, sinfo_args=(R.Object,)) R.vm.kill_object(reshape290) R.vm.kill_object(reshape291) R.vm.kill_object(lv43) model_decoder_layers_9_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[714] gv937: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc572: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv937, R.dtype("float16")) _570: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_9_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc572) R.vm.kill_object(model_decoder_layers_9_encoder_attn_k_proj_weight1) gv938: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape292: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc572, gv938, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc572) model_decoder_layers_9_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[715] model_decoder_layers_9_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[716] gv939: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc573: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv939, R.dtype("float16")) _571: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_9_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_9_encoder_attn_v_proj_bias1, alloc573) R.vm.kill_object(model_decoder_layers_9_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_9_encoder_attn_v_proj_bias1) gv940: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape293: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc573, gv940, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc573) gv941: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape294: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape292, gv941, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape292) gv942: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape295: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape293, gv942, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape293) lv45: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv44, R.prim_value(9), reshape294, reshape295, 
sinfo_args=(R.Object,)) R.vm.kill_object(reshape294) R.vm.kill_object(reshape295) R.vm.kill_object(lv44) model_decoder_layers_10_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[738] gv943: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc574: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv943, R.dtype("float16")) _572: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_10_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc574) R.vm.kill_object(model_decoder_layers_10_encoder_attn_k_proj_weight1) gv944: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape296: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc574, gv944, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc574) model_decoder_layers_10_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[739] model_decoder_layers_10_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[740] gv945: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc575: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv945, R.dtype("float16")) _573: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_decoder_layers_10_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_10_encoder_attn_v_proj_bias1, alloc575) R.vm.kill_object(model_decoder_layers_10_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_10_encoder_attn_v_proj_bias1) gv946: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape297: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc575, gv946, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc575) gv947: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape298: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape296, gv947, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape296) gv948: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape299: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape297, gv948, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape297) lv46: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv45, R.prim_value(10), reshape298, reshape299, sinfo_args=(R.Object,)) R.vm.kill_object(reshape298) R.vm.kill_object(reshape299) R.vm.kill_object(lv45) model_decoder_layers_11_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[762] gv949: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc576: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv949, R.dtype("float16")) _574: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_11_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc576) R.vm.kill_object(model_decoder_layers_11_encoder_attn_k_proj_weight1) gv950: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape300: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc576, gv950, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc576) model_decoder_layers_11_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[763] model_decoder_layers_11_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[764] gv951: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc577: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv951, R.dtype("float16")) _575: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_11_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_11_encoder_attn_v_proj_bias1, alloc577) R.vm.kill_object(model_decoder_layers_11_encoder_attn_v_proj_weight1) 
R.vm.kill_object(model_decoder_layers_11_encoder_attn_v_proj_bias1) gv952: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape301: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc577, gv952, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc577) gv953: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape302: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape300, gv953, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape300) gv954: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape303: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape301, gv954, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape301) lv47: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv46, R.prim_value(11), reshape302, reshape303, sinfo_args=(R.Object,)) R.vm.kill_object(reshape302) R.vm.kill_object(reshape303) R.vm.kill_object(lv46) model_decoder_layers_12_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[786] gv955: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc578: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv955, R.dtype("float16")) _576: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_12_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc578) R.vm.kill_object(model_decoder_layers_12_encoder_attn_k_proj_weight1) gv956: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape304: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc578, gv956, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc578) model_decoder_layers_12_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[787] model_decoder_layers_12_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[788] gv957: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc579: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv957, R.dtype("float16")) _577: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_12_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_12_encoder_attn_v_proj_bias1, alloc579) R.vm.kill_object(model_decoder_layers_12_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_12_encoder_attn_v_proj_bias1) gv958: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape305: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc579, gv958, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc579) gv959: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape306: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape304, gv959, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape304) gv960: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape307: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape305, gv960, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape305) lv48: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv47, R.prim_value(12), reshape306, reshape307, sinfo_args=(R.Object,)) R.vm.kill_object(reshape306) R.vm.kill_object(reshape307) R.vm.kill_object(lv47) model_decoder_layers_13_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[810] gv961: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc580: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv961, R.dtype("float16")) _578: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_13_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc580) R.vm.kill_object(model_decoder_layers_13_encoder_attn_k_proj_weight1) gv962: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape308: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc580, gv962, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc580) model_decoder_layers_13_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[811] model_decoder_layers_13_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[812] gv963: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc581: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv963, R.dtype("float16")) _579: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_13_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_13_encoder_attn_v_proj_bias1, alloc581) R.vm.kill_object(model_decoder_layers_13_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_13_encoder_attn_v_proj_bias1) gv964: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape309: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc581, gv964, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc581) gv965: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape310: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape308, gv965, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape308) gv966: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape311: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape309, gv966, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape309) lv49: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv48, R.prim_value(13), reshape310, reshape311, sinfo_args=(R.Object,)) R.vm.kill_object(reshape310) R.vm.kill_object(reshape311) R.vm.kill_object(lv48) model_decoder_layers_14_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[834] gv967: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc582: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv967, R.dtype("float16")) _580: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_14_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc582) R.vm.kill_object(model_decoder_layers_14_encoder_attn_k_proj_weight1) gv968: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape312: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc582, gv968, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc582) model_decoder_layers_14_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[835] model_decoder_layers_14_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[836] gv969: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc583: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv969, R.dtype("float16")) _581: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_14_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_14_encoder_attn_v_proj_bias1, alloc583) R.vm.kill_object(model_decoder_layers_14_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_14_encoder_attn_v_proj_bias1) gv970: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape313: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc583, gv970, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc583) gv971: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape314: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape312, gv971, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape312) gv972: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape315: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape313, gv972, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape313) lv50: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv49, R.prim_value(14), reshape314, reshape315, sinfo_args=(R.Object,)) R.vm.kill_object(reshape314) R.vm.kill_object(reshape315) R.vm.kill_object(lv49) model_decoder_layers_15_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[858] gv973: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc584: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv973, R.dtype("float16")) _582: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_15_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc584) R.vm.kill_object(model_decoder_layers_15_encoder_attn_k_proj_weight1) gv974: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape316: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc584, gv974, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc584) model_decoder_layers_15_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[859] model_decoder_layers_15_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[860] gv975: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc585: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv975, R.dtype("float16")) _583: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_15_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_15_encoder_attn_v_proj_bias1, alloc585) R.vm.kill_object(model_decoder_layers_15_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_15_encoder_attn_v_proj_bias1) gv976: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape317: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc585, gv976, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc585) gv977: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape318: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape316, gv977, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape316) gv978: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape319: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape317, gv978, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape317) lv51: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv50, R.prim_value(15), reshape318, reshape319, sinfo_args=(R.Object,)) R.vm.kill_object(reshape318) R.vm.kill_object(reshape319) R.vm.kill_object(lv50) model_decoder_layers_16_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[882] gv979: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc586: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv979, R.dtype("float16")) _584: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_16_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc586) R.vm.kill_object(model_decoder_layers_16_encoder_attn_k_proj_weight1) gv980: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape320: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc586, gv980, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc586) 
model_decoder_layers_16_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[883] model_decoder_layers_16_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[884] gv981: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc587: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv981, R.dtype("float16")) _585: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_16_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_16_encoder_attn_v_proj_bias1, alloc587) R.vm.kill_object(model_decoder_layers_16_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_16_encoder_attn_v_proj_bias1) gv982: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape321: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc587, gv982, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc587) gv983: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape322: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape320, gv983, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape320) gv984: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape323: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape321, gv984, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape321) lv52: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv51, R.prim_value(16), reshape322, reshape323, sinfo_args=(R.Object,)) R.vm.kill_object(reshape322) R.vm.kill_object(reshape323) R.vm.kill_object(lv51) model_decoder_layers_17_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[906] gv985: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc588: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv985, R.dtype("float16")) _586: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_17_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc588) R.vm.kill_object(model_decoder_layers_17_encoder_attn_k_proj_weight1) gv986: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape324: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc588, gv986, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc588) model_decoder_layers_17_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[907] model_decoder_layers_17_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[908] gv987: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc589: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv987, R.dtype("float16")) _587: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_17_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_17_encoder_attn_v_proj_bias1, alloc589) R.vm.kill_object(model_decoder_layers_17_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_17_encoder_attn_v_proj_bias1) gv988: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape325: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc589, gv988, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc589) gv989: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape326: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape324, gv989, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape324) gv990: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape327: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape325, gv990, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape325) lv53: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv52, R.prim_value(17), reshape326, reshape327, sinfo_args=(R.Object,)) R.vm.kill_object(reshape326) R.vm.kill_object(reshape327) R.vm.kill_object(lv52) model_decoder_layers_18_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[930] gv991: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc590: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv991, R.dtype("float16")) _588: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_18_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc590) R.vm.kill_object(model_decoder_layers_18_encoder_attn_k_proj_weight1) gv992: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape328: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc590, gv992, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc590) model_decoder_layers_18_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[931] model_decoder_layers_18_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[932] gv993: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc591: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv993, R.dtype("float16")) _589: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_18_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_18_encoder_attn_v_proj_bias1, alloc591) R.vm.kill_object(model_decoder_layers_18_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_18_encoder_attn_v_proj_bias1) gv994: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape329: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc591, gv994, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc591) gv995: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape330: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape328, gv995, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape328) gv996: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape331: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape329, gv996, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape329) lv54: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv53, R.prim_value(18), reshape330, reshape331, 
sinfo_args=(R.Object,)) R.vm.kill_object(reshape330) R.vm.kill_object(reshape331) R.vm.kill_object(lv53) model_decoder_layers_19_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[954] gv997: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc592: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv997, R.dtype("float16")) _590: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_19_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc592) R.vm.kill_object(model_decoder_layers_19_encoder_attn_k_proj_weight1) gv998: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape332: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc592, gv998, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc592) model_decoder_layers_19_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[955] model_decoder_layers_19_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[956] gv999: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc593: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv999, R.dtype("float16")) _591: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_decoder_layers_19_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_19_encoder_attn_v_proj_bias1, alloc593) R.vm.kill_object(model_decoder_layers_19_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_19_encoder_attn_v_proj_bias1) gv1000: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape333: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc593, gv1000, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc593) gv1001: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape334: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape332, gv1001, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape332) gv1002: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape335: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape333, gv1002, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape333) lv55: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv54, R.prim_value(19), reshape334, reshape335, sinfo_args=(R.Object,)) R.vm.kill_object(reshape334) R.vm.kill_object(reshape335) R.vm.kill_object(lv54) model_decoder_layers_20_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") 
= packed_params[978] gv1003: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc594: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1003, R.dtype("float16")) _592: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_20_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc594) R.vm.kill_object(model_decoder_layers_20_encoder_attn_k_proj_weight1) gv1004: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape336: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc594, gv1004, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc594) model_decoder_layers_20_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[979] model_decoder_layers_20_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[980] gv1005: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc595: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1005, R.dtype("float16")) _593: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_20_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_20_encoder_attn_v_proj_bias1, alloc595) R.vm.kill_object(model_decoder_layers_20_encoder_attn_v_proj_weight1) 
R.vm.kill_object(model_decoder_layers_20_encoder_attn_v_proj_bias1) gv1006: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape337: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc595, gv1006, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc595) gv1007: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape338: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape336, gv1007, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape336) gv1008: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape339: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape337, gv1008, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape337) lv56: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv55, R.prim_value(20), reshape338, reshape339, sinfo_args=(R.Object,)) R.vm.kill_object(reshape338) R.vm.kill_object(reshape339) R.vm.kill_object(lv55) model_decoder_layers_21_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1002] gv1009: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc596: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1009, R.dtype("float16")) _594: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_21_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc596) R.vm.kill_object(model_decoder_layers_21_encoder_attn_k_proj_weight1) gv1010: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape340: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc596, gv1010, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc596) model_decoder_layers_21_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1003] model_decoder_layers_21_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1004] gv1011: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc597: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1011, R.dtype("float16")) _595: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_21_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_21_encoder_attn_v_proj_bias1, alloc597) R.vm.kill_object(model_decoder_layers_21_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_21_encoder_attn_v_proj_bias1) gv1012: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape341: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc597, gv1012, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc597) gv1013: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape342: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape340, gv1013, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape340) gv1014: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape343: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape341, gv1014, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape341) lv57: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv56, R.prim_value(21), reshape342, reshape343, sinfo_args=(R.Object,)) R.vm.kill_object(reshape342) R.vm.kill_object(reshape343) R.vm.kill_object(lv56) model_decoder_layers_22_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1026] gv1015: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc598: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1015, R.dtype("float16")) _596: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_22_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc598) R.vm.kill_object(model_decoder_layers_22_encoder_attn_k_proj_weight1) gv1016: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape344: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc598, gv1016, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc598) model_decoder_layers_22_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1027] model_decoder_layers_22_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1028] gv1017: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc599: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1017, R.dtype("float16")) _597: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_22_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_22_encoder_attn_v_proj_bias1, alloc599) R.vm.kill_object(model_decoder_layers_22_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_22_encoder_attn_v_proj_bias1) gv1018: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape345: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc599, gv1018, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc599) gv1019: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape346: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape344, gv1019, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape344) gv1020: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape347: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape345, gv1020, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape345) lv58: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv57, R.prim_value(22), reshape346, reshape347, sinfo_args=(R.Object,)) R.vm.kill_object(reshape346) R.vm.kill_object(reshape347) R.vm.kill_object(lv57) model_decoder_layers_23_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1050] gv1021: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc600: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1021, R.dtype("float16")) _598: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_23_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc600) R.vm.kill_object(model_decoder_layers_23_encoder_attn_k_proj_weight1) 
gv1022: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape348: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc600, gv1022, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc600) model_decoder_layers_23_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1051] model_decoder_layers_23_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1052] gv1023: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc601: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1023, R.dtype("float16")) _599: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_23_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_23_encoder_attn_v_proj_bias1, alloc601) R.vm.kill_object(model_decoder_layers_23_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_23_encoder_attn_v_proj_bias1) gv1024: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape349: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc601, gv1024, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc601) gv1025: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape350: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape348, gv1025, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape348) gv1026: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape351: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape349, gv1026, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape349) lv59: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv58, R.prim_value(23), reshape350, reshape351, sinfo_args=(R.Object,)) R.vm.kill_object(reshape350) R.vm.kill_object(reshape351) R.vm.kill_object(lv58) model_decoder_layers_24_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1074] gv1027: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc602: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1027, R.dtype("float16")) _600: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_24_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc602) R.vm.kill_object(model_decoder_layers_24_encoder_attn_k_proj_weight1) gv1028: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape352: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc602, gv1028, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc602) model_decoder_layers_24_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1075] model_decoder_layers_24_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1076] gv1029: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc603: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1029, R.dtype("float16")) _601: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_24_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_24_encoder_attn_v_proj_bias1, alloc603) R.vm.kill_object(model_decoder_layers_24_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_24_encoder_attn_v_proj_bias1) gv1030: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape353: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc603, gv1030, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc603) gv1031: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape354: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape352, gv1031, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape352) gv1032: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape355: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape353, gv1032, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape353) lv60: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv59, R.prim_value(24), reshape354, reshape355, sinfo_args=(R.Object,)) R.vm.kill_object(reshape354) R.vm.kill_object(reshape355) R.vm.kill_object(lv59) model_decoder_layers_25_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1098] gv1033: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc604: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1033, R.dtype("float16")) _602: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_25_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc604) R.vm.kill_object(model_decoder_layers_25_encoder_attn_k_proj_weight1) gv1034: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape356: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc604, gv1034, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc604) model_decoder_layers_25_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1099] model_decoder_layers_25_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1100] gv1035: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc605: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1035, R.dtype("float16")) _603: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_25_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_25_encoder_attn_v_proj_bias1, alloc605) R.vm.kill_object(model_decoder_layers_25_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_25_encoder_attn_v_proj_bias1) gv1036: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape357: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc605, gv1036, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc605) gv1037: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape358: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape356, gv1037, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape356) gv1038: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape359: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape357, gv1038, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape357) lv61: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv60, R.prim_value(25), reshape358, reshape359, sinfo_args=(R.Object,)) R.vm.kill_object(reshape358) R.vm.kill_object(reshape359) R.vm.kill_object(lv60) model_decoder_layers_26_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1122] gv1039: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc606: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1039, R.dtype("float16")) _604: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_26_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc606) R.vm.kill_object(model_decoder_layers_26_encoder_attn_k_proj_weight1) gv1040: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape360: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc606, gv1040, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc606) model_decoder_layers_26_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1123] model_decoder_layers_26_encoder_attn_v_proj_bias1: R.Tensor((1280,), 
dtype="float16") = packed_params[1124] gv1041: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc607: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1041, R.dtype("float16")) _605: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_26_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_26_encoder_attn_v_proj_bias1, alloc607) R.vm.kill_object(model_decoder_layers_26_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_26_encoder_attn_v_proj_bias1) gv1042: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape361: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc607, gv1042, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc607) gv1043: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape362: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape360, gv1043, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape360) gv1044: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape363: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape361, gv1044, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape361) lv62: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv61, R.prim_value(26), reshape362, reshape363, sinfo_args=(R.Object,)) R.vm.kill_object(reshape362) R.vm.kill_object(reshape363) R.vm.kill_object(lv61) model_decoder_layers_27_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1146] gv1045: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc608: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1045, R.dtype("float16")) _606: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_27_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc608) R.vm.kill_object(model_decoder_layers_27_encoder_attn_k_proj_weight1) gv1046: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape364: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc608, gv1046, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc608) model_decoder_layers_27_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1147] model_decoder_layers_27_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1148] gv1047: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc609: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1047, R.dtype("float16")) _607: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_27_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_27_encoder_attn_v_proj_bias1, alloc609) R.vm.kill_object(model_decoder_layers_27_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_27_encoder_attn_v_proj_bias1) gv1048: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape365: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc609, gv1048, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc609) gv1049: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape366: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape364, gv1049, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape364) gv1050: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape367: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape365, gv1050, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape365) lv63: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv62, R.prim_value(27), reshape366, reshape367, sinfo_args=(R.Object,)) R.vm.kill_object(reshape366) R.vm.kill_object(reshape367) R.vm.kill_object(lv62) model_decoder_layers_28_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1170] gv1051: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc610: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1051, R.dtype("float16")) _608: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_28_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc610) R.vm.kill_object(model_decoder_layers_28_encoder_attn_k_proj_weight1) gv1052: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape368: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc610, gv1052, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc610) model_decoder_layers_28_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1171] model_decoder_layers_28_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1172] gv1053: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc611: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1053, R.dtype("float16")) _609: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_28_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_28_encoder_attn_v_proj_bias1, alloc611) R.vm.kill_object(model_decoder_layers_28_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_28_encoder_attn_v_proj_bias1) gv1054: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape369: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc611, gv1054, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc611) gv1055: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape370: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape368, gv1055, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape368) gv1056: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape371: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape369, gv1056, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape369) lv64: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv63, R.prim_value(28), reshape370, reshape371, sinfo_args=(R.Object,)) R.vm.kill_object(reshape370) R.vm.kill_object(reshape371) R.vm.kill_object(lv63) 
model_decoder_layers_29_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1194] gv1057: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc612: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1057, R.dtype("float16")) _610: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_29_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc612) R.vm.kill_object(model_decoder_layers_29_encoder_attn_k_proj_weight1) gv1058: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape372: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc612, gv1058, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc612) model_decoder_layers_29_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1195] model_decoder_layers_29_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1196] gv1059: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc613: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1059, R.dtype("float16")) _611: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_29_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_29_encoder_attn_v_proj_bias1, alloc613) 
R.vm.kill_object(model_decoder_layers_29_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_29_encoder_attn_v_proj_bias1) gv1060: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape373: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc613, gv1060, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc613) gv1061: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape374: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape372, gv1061, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape372) gv1062: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape375: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape373, gv1062, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape373) lv65: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv64, R.prim_value(29), reshape374, reshape375, sinfo_args=(R.Object,)) R.vm.kill_object(reshape374) R.vm.kill_object(reshape375) R.vm.kill_object(lv64) model_decoder_layers_30_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1218] gv1063: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc614: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1063, R.dtype("float16")) _612: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_30_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc614) R.vm.kill_object(model_decoder_layers_30_encoder_attn_k_proj_weight1) gv1064: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape376: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc614, gv1064, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc614) model_decoder_layers_30_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1219] model_decoder_layers_30_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1220] gv1065: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc615: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1065, R.dtype("float16")) _613: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_30_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_30_encoder_attn_v_proj_bias1, alloc615) R.vm.kill_object(model_decoder_layers_30_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_30_encoder_attn_v_proj_bias1) gv1066: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape377: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc615, gv1066, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc615) gv1067: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape378: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape376, gv1067, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape376) gv1068: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape379: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape377, gv1068, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape377) lv66: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv65, R.prim_value(30), reshape378, reshape379, sinfo_args=(R.Object,)) R.vm.kill_object(reshape378) R.vm.kill_object(reshape379) R.vm.kill_object(lv65) model_decoder_layers_31_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1242] gv1069: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc616: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1069, 
R.dtype("float16")) R.vm.kill_object(storage11) _614: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_31_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc616) R.vm.kill_object(model_decoder_layers_31_encoder_attn_k_proj_weight1) gv1070: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape380: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc616, gv1070, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc616) model_decoder_layers_31_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1243] model_decoder_layers_31_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1244] gv1071: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc617: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1071, R.dtype("float16")) R.vm.kill_object(storage12) _615: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_31_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_31_encoder_attn_v_proj_bias1, alloc617) R.vm.kill_object(model_decoder_layers_31_encoder_attn_v_proj_weight1) R.vm.kill_object(model_decoder_layers_31_encoder_attn_v_proj_bias1) gv1072: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape381: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc617, gv1072, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc617) gv1073: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape382: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape380, gv1073, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape380) gv1074: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape383: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape381, gv1074, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape381) gv1: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv66, R.prim_value(31), reshape382, reshape383, sinfo_args=(R.Object,)) R.vm.kill_object(reshape382) R.vm.kill_object(reshape383) R.vm.kill_object(lv66) return gv1 @R.function def batch_decode(input_ids: R.Tensor(("batch_size", 1), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor(("batch_size", 1, 51866), dtype="float32"): batch_size = T.int64() R.func_attr({"num_input": 2, "relax.force_pure": 1, "relax.rewrite_cuda_graph.capture_symbolic_vars": ["batch_size"], "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = 
R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(2),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=batch_decode, loc=param[0], param=input_ids, annotation=R.Tensor((batch_size, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_decode, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), 
R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), 
R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.str("ErrorContext(fn=batch_decode, loc=param[0], param=input_ids, annotation=R.Tensor((batch_size, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) model_decoder_embed_tokens_weight3: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] gv1075: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) reshape707: R.Tensor((batch_size,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, gv1075, sinfo_args=(R.Tensor((batch_size,), dtype="int32"),)) model_decoder_embed_tokens_weight3_1: R.Tensor((51866, 1280), dtype="float16") = 
packed_params[487] storage13: R.Object = R.vm.alloc_storage(R.shape([81920]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1076: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc618: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1076, R.dtype("float16")) cls.take(model_decoder_embed_tokens_weight3_1, reshape707, alloc618) R.vm.kill_object(reshape707) R.vm.kill_object(model_decoder_embed_tokens_weight3_1) gv1077: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape708: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc618, gv1077, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc618) lv133: R.Tensor((batch_size,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((batch_size,), dtype="int32"),)) model_decoder_embed_positions_weight3: R.Tensor((448, 1280), dtype="float16") = packed_params[488] storage14: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1078: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc619: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1078, R.dtype("float16")) cls.take1(model_decoder_embed_positions_weight3, lv133, alloc619) R.vm.kill_object(lv133) R.vm.kill_object(model_decoder_embed_positions_weight3) gv1079: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape709: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc619, gv1079, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc619) storage15: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1080: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc620: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1080, R.dtype("float16")) cls.add(reshape708, reshape709, alloc620) R.vm.kill_object(reshape708) R.vm.kill_object(reshape709) model_decoder_layers_0_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[496] model_decoder_layers_0_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[497] gv1081: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc621: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1081, R.dtype("float16")) cls.layer_norm(alloc620, model_decoder_layers_0_self_attn_layer_norm_weight3, model_decoder_layers_0_self_attn_layer_norm_bias3, alloc621) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias3) model_decoder_layers_0_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] model_decoder_layers_0_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[493] gv1082: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc622: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1082, R.dtype("float16")) _620: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_self_attn_q_proj_weight3, alloc621, model_decoder_layers_0_self_attn_q_proj_bias3, alloc622) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias3) gv1083: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape710: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc622, gv1083, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc622) model_decoder_layers_0_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] storage16: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1084: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc623: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1084, R.dtype("float16")) _621: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_0_self_attn_k_proj_weight3, alloc621, alloc623) R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight3) gv1085: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape711: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc623, gv1085, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc623) model_decoder_layers_0_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] model_decoder_layers_0_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[491] storage17: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1086: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc624: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1086, R.dtype("float16")) _622: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_self_attn_v_proj_weight3, alloc621, model_decoder_layers_0_self_attn_v_proj_bias3, alloc624) R.vm.kill_object(alloc621) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias3) gv1087: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape712: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc624, gv1087, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc624) gv1088: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc625: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1088, R.dtype("float16")) cls.concatenate(reshape710, reshape711, reshape712, alloc625) R.vm.kill_object(reshape710) R.vm.kill_object(reshape711) R.vm.kill_object(reshape712) gv1089: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape713: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc625, gv1089, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc625) gv1090: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc626: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1090, R.dtype("float16")) _624: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape713, alloc626) R.vm.kill_object(reshape713) gv1091: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape714: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc626, gv1091, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc626) gv1092: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape715: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape714, gv1092, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape714) model_decoder_layers_0_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] model_decoder_layers_0_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[495] gv1093: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc627: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1093, R.dtype("float16")) _625: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_self_attn_out_proj_weight3, reshape715, model_decoder_layers_0_self_attn_out_proj_bias3, alloc627) R.vm.kill_object(reshape715) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias3) gv1094: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc628: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1094, R.dtype("float16")) cls.add(alloc620, alloc627, alloc628) R.vm.kill_object(alloc620) R.vm.kill_object(alloc627) model_decoder_layers_0_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[505] model_decoder_layers_0_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[506] gv1095: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc629: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1095, R.dtype("float16")) cls.layer_norm(alloc628, model_decoder_layers_0_encoder_attn_layer_norm_weight3, model_decoder_layers_0_encoder_attn_layer_norm_bias3, alloc629) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias3) model_decoder_layers_0_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[501] model_decoder_layers_0_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[502] gv1096: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc630: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1096, R.dtype("float16")) _628: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_encoder_attn_q_proj_weight3, alloc629, model_decoder_layers_0_encoder_attn_q_proj_bias3, alloc630) R.vm.kill_object(alloc629) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias3) gv1097: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape716: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc630, gv1097, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc630) gv1098: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape717: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape716, gv1098, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape716) gv1099: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc631: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1099, R.dtype("float16")) _629: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape717, alloc631) R.vm.kill_object(reshape717) gv1100: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape718: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc631, gv1100, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc631) gv1101: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape719: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape718, gv1101, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape718) model_decoder_layers_0_encoder_attn_out_proj_weight3: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[503] model_decoder_layers_0_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[504] gv1102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc632: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1102, R.dtype("float16")) _630: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_encoder_attn_out_proj_weight3, reshape719, model_decoder_layers_0_encoder_attn_out_proj_bias3, alloc632) R.vm.kill_object(reshape719) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias3) gv1103: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc633: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1103, R.dtype("float16")) cls.add(alloc628, alloc632, alloc633) R.vm.kill_object(alloc628) R.vm.kill_object(alloc632) model_decoder_layers_0_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[511] model_decoder_layers_0_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[512] gv1104: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc634: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1104, R.dtype("float16")) cls.layer_norm(alloc633, model_decoder_layers_0_final_layer_norm_weight3, 
model_decoder_layers_0_final_layer_norm_bias3, alloc634) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias3) model_decoder_layers_0_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] model_decoder_layers_0_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[508] gv1105: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc635: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1105, R.dtype("float16")) _633: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_0_fc1_weight3, alloc634, model_decoder_layers_0_fc1_bias3, alloc635) R.vm.kill_object(alloc634) R.vm.kill_object(model_decoder_layers_0_fc1_weight3) R.vm.kill_object(model_decoder_layers_0_fc1_bias3) model_decoder_layers_0_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] model_decoder_layers_0_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[510] gv1106: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc636: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1106, R.dtype("float16")) _634: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_0_fc2_weight3, alloc635, model_decoder_layers_0_fc2_bias3, alloc636) R.vm.kill_object(alloc635) R.vm.kill_object(model_decoder_layers_0_fc2_weight3) R.vm.kill_object(model_decoder_layers_0_fc2_bias3) gv1107: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc637: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1107, R.dtype("float16")) cls.add(alloc633, alloc636, alloc637) R.vm.kill_object(alloc633) R.vm.kill_object(alloc636) model_decoder_layers_1_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[520] model_decoder_layers_1_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[521] gv1108: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc638: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1108, R.dtype("float16")) cls.layer_norm(alloc637, model_decoder_layers_1_self_attn_layer_norm_weight3, model_decoder_layers_1_self_attn_layer_norm_bias3, alloc638) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias3) model_decoder_layers_1_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] model_decoder_layers_1_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[517] gv1109: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc639: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1109, R.dtype("float16")) _637: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_self_attn_q_proj_weight3, alloc638, model_decoder_layers_1_self_attn_q_proj_bias3, alloc639) 
R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias3) gv1110: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape720: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc639, gv1110, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc639) model_decoder_layers_1_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] gv1111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc640: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1111, R.dtype("float16")) _638: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_1_self_attn_k_proj_weight3, alloc638, alloc640) R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight3) gv1112: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape721: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc640, gv1112, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc640) model_decoder_layers_1_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[514] model_decoder_layers_1_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[515] gv1113: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc641: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1113, R.dtype("float16")) _639: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_self_attn_v_proj_weight3, alloc638, model_decoder_layers_1_self_attn_v_proj_bias3, alloc641) R.vm.kill_object(alloc638) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias3) gv1114: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape722: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc641, gv1114, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc641) gv1115: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc642: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1115, R.dtype("float16")) cls.concatenate(reshape720, reshape721, reshape722, alloc642) R.vm.kill_object(reshape720) R.vm.kill_object(reshape721) R.vm.kill_object(reshape722) gv1116: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape723: R.Tensor((batch_size, 60, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc642, gv1116, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc642) gv1117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc643: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1117, R.dtype("float16")) _641: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape723, alloc643) R.vm.kill_object(reshape723) gv1118: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape724: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc643, gv1118, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc643) gv1119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape725: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape724, gv1119, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape724) model_decoder_layers_1_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] model_decoder_layers_1_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[519] gv1120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc644: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1120, R.dtype("float16")) _642: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_self_attn_out_proj_weight3, reshape725, model_decoder_layers_1_self_attn_out_proj_bias3, alloc644) R.vm.kill_object(reshape725) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias3) gv1121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc645: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1121, R.dtype("float16")) cls.add(alloc637, alloc644, alloc645) R.vm.kill_object(alloc637) R.vm.kill_object(alloc644) model_decoder_layers_1_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[529] model_decoder_layers_1_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[530] gv1122: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc646: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1122, R.dtype("float16")) cls.layer_norm(alloc645, model_decoder_layers_1_encoder_attn_layer_norm_weight3, model_decoder_layers_1_encoder_attn_layer_norm_bias3, alloc646) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias3) model_decoder_layers_1_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[525] 
model_decoder_layers_1_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[526] gv1123: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc647: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1123, R.dtype("float16")) _645: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_encoder_attn_q_proj_weight3, alloc646, model_decoder_layers_1_encoder_attn_q_proj_bias3, alloc647) R.vm.kill_object(alloc646) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias3) gv1124: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape726: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc647, gv1124, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc647) gv1125: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape727: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape726, gv1125, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape726) gv1126: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc648: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1126, R.dtype("float16")) _646: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape727, alloc648) R.vm.kill_object(reshape727) gv1127: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape728: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc648, gv1127, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc648) gv1128: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape729: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape728, gv1128, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape728) model_decoder_layers_1_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] model_decoder_layers_1_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[528] gv1129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc649: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1129, R.dtype("float16")) _647: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_encoder_attn_out_proj_weight3, reshape729, model_decoder_layers_1_encoder_attn_out_proj_bias3, 
alloc649) R.vm.kill_object(reshape729) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias3) gv1130: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc650: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1130, R.dtype("float16")) cls.add(alloc645, alloc649, alloc650) R.vm.kill_object(alloc645) R.vm.kill_object(alloc649) model_decoder_layers_1_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[535] model_decoder_layers_1_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[536] gv1131: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc651: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1131, R.dtype("float16")) cls.layer_norm(alloc650, model_decoder_layers_1_final_layer_norm_weight3, model_decoder_layers_1_final_layer_norm_bias3, alloc651) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias3) model_decoder_layers_1_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] model_decoder_layers_1_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[532] gv1132: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc652: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1132, R.dtype("float16")) _650: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_1_fc1_weight3, alloc651, model_decoder_layers_1_fc1_bias3, alloc652) R.vm.kill_object(alloc651) R.vm.kill_object(model_decoder_layers_1_fc1_weight3) R.vm.kill_object(model_decoder_layers_1_fc1_bias3) model_decoder_layers_1_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] model_decoder_layers_1_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[534] gv1133: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc653: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1133, R.dtype("float16")) _651: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_1_fc2_weight3, alloc652, model_decoder_layers_1_fc2_bias3, alloc653) R.vm.kill_object(alloc652) R.vm.kill_object(model_decoder_layers_1_fc2_weight3) R.vm.kill_object(model_decoder_layers_1_fc2_bias3) gv1134: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc654: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1134, R.dtype("float16")) cls.add(alloc650, alloc653, alloc654) R.vm.kill_object(alloc650) R.vm.kill_object(alloc653) model_decoder_layers_2_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[544] model_decoder_layers_2_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[545] gv1135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc655: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1135, R.dtype("float16")) cls.layer_norm(alloc654, model_decoder_layers_2_self_attn_layer_norm_weight3, model_decoder_layers_2_self_attn_layer_norm_bias3, alloc655) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias3) model_decoder_layers_2_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] model_decoder_layers_2_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[541] gv1136: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc656: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1136, R.dtype("float16")) _654: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_self_attn_q_proj_weight3, alloc655, model_decoder_layers_2_self_attn_q_proj_bias3, alloc656) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias3) gv1137: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape730: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc656, gv1137, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc656) model_decoder_layers_2_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] gv1138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc657: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1138, R.dtype("float16")) _655: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_2_self_attn_k_proj_weight3, alloc655, alloc657) R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight3) gv1139: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape731: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc657, gv1139, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc657) model_decoder_layers_2_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[538] model_decoder_layers_2_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[539] gv1140: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc658: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1140, R.dtype("float16")) _656: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_self_attn_v_proj_weight3, alloc655, model_decoder_layers_2_self_attn_v_proj_bias3, alloc658) R.vm.kill_object(alloc655) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias3) gv1141: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape732: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc658, gv1141, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc658) gv1142: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc659: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1142, R.dtype("float16")) cls.concatenate(reshape730, reshape731, reshape732, alloc659) R.vm.kill_object(reshape730) R.vm.kill_object(reshape731) R.vm.kill_object(reshape732) gv1143: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape733: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc659, gv1143, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc659) gv1144: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc660: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1144, R.dtype("float16")) _658: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape733, alloc660) R.vm.kill_object(reshape733) gv1145: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape734: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc660, gv1145, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc660) gv1146: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape735: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape734, gv1146, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape734) model_decoder_layers_2_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] model_decoder_layers_2_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[543] gv1147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc661: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1147, R.dtype("float16")) _659: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_self_attn_out_proj_weight3, reshape735, model_decoder_layers_2_self_attn_out_proj_bias3, alloc661) R.vm.kill_object(reshape735) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias3) gv1148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc662: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1148, R.dtype("float16")) cls.add(alloc654, alloc661, alloc662) R.vm.kill_object(alloc654) R.vm.kill_object(alloc661) model_decoder_layers_2_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[553] model_decoder_layers_2_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[554] gv1149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc663: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1149, R.dtype("float16")) cls.layer_norm(alloc662, model_decoder_layers_2_encoder_attn_layer_norm_weight3, model_decoder_layers_2_encoder_attn_layer_norm_bias3, alloc663) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias3) model_decoder_layers_2_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] model_decoder_layers_2_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[550] gv1150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc664: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1150, R.dtype("float16")) _662: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_encoder_attn_q_proj_weight3, alloc663, model_decoder_layers_2_encoder_attn_q_proj_bias3, alloc664) R.vm.kill_object(alloc663) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight3) 
R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias3) gv1151: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape736: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc664, gv1151, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc664) gv1152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape737: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape736, gv1152, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape736) gv1153: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc665: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1153, R.dtype("float16")) _663: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape737, alloc665) R.vm.kill_object(reshape737) gv1154: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape738: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc665, gv1154, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc665) 
gv1155: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape739: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape738, gv1155, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape738) model_decoder_layers_2_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] model_decoder_layers_2_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[552] gv1156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc666: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1156, R.dtype("float16")) _664: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_encoder_attn_out_proj_weight3, reshape739, model_decoder_layers_2_encoder_attn_out_proj_bias3, alloc666) R.vm.kill_object(reshape739) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias3) gv1157: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc667: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1157, R.dtype("float16")) cls.add(alloc662, alloc666, alloc667) R.vm.kill_object(alloc662) R.vm.kill_object(alloc666) model_decoder_layers_2_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[559] model_decoder_layers_2_final_layer_norm_bias3: 
R.Tensor((1280,), dtype="float16") = packed_params[560] gv1158: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc668: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1158, R.dtype("float16")) cls.layer_norm(alloc667, model_decoder_layers_2_final_layer_norm_weight3, model_decoder_layers_2_final_layer_norm_bias3, alloc668) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias3) model_decoder_layers_2_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] model_decoder_layers_2_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[556] gv1159: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc669: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1159, R.dtype("float16")) _667: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_2_fc1_weight3, alloc668, model_decoder_layers_2_fc1_bias3, alloc669) R.vm.kill_object(alloc668) R.vm.kill_object(model_decoder_layers_2_fc1_weight3) R.vm.kill_object(model_decoder_layers_2_fc1_bias3) model_decoder_layers_2_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] model_decoder_layers_2_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[558] gv1160: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc670: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1160, R.dtype("float16")) _668: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_2_fc2_weight3, alloc669, model_decoder_layers_2_fc2_bias3, alloc670) R.vm.kill_object(alloc669) R.vm.kill_object(model_decoder_layers_2_fc2_weight3) R.vm.kill_object(model_decoder_layers_2_fc2_bias3) gv1161: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc671: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1161, R.dtype("float16")) cls.add(alloc667, alloc670, alloc671) R.vm.kill_object(alloc667) R.vm.kill_object(alloc670) model_decoder_layers_3_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[568] model_decoder_layers_3_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[569] gv1162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc672: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1162, R.dtype("float16")) cls.layer_norm(alloc671, model_decoder_layers_3_self_attn_layer_norm_weight3, model_decoder_layers_3_self_attn_layer_norm_bias3, alloc672) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias3) model_decoder_layers_3_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] model_decoder_layers_3_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[565] gv1163: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc673: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1163, R.dtype("float16")) _671: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_self_attn_q_proj_weight3, alloc672, model_decoder_layers_3_self_attn_q_proj_bias3, alloc673) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias3) gv1164: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape740: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc673, gv1164, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc673) model_decoder_layers_3_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] gv1165: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc674: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1165, R.dtype("float16")) _672: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_3_self_attn_k_proj_weight3, alloc672, alloc674) R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight3) gv1166: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape741: 
R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc674, gv1166, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc674) model_decoder_layers_3_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[562] model_decoder_layers_3_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[563] gv1167: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc675: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1167, R.dtype("float16")) _673: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_self_attn_v_proj_weight3, alloc672, model_decoder_layers_3_self_attn_v_proj_bias3, alloc675) R.vm.kill_object(alloc672) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias3) gv1168: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape742: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc675, gv1168, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc675) gv1169: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc676: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1169, R.dtype("float16")) 
cls.concatenate(reshape740, reshape741, reshape742, alloc676) R.vm.kill_object(reshape740) R.vm.kill_object(reshape741) R.vm.kill_object(reshape742) gv1170: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape743: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc676, gv1170, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc676) gv1171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc677: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1171, R.dtype("float16")) _675: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape743, alloc677) R.vm.kill_object(reshape743) gv1172: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape744: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc677, gv1172, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc677) gv1173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape745: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape744, gv1173, sinfo_args=(R.Tensor((batch_size, 1, 
1280), dtype="float16"),)) R.vm.kill_object(reshape744) model_decoder_layers_3_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] model_decoder_layers_3_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[567] gv1174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc678: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1174, R.dtype("float16")) _676: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_self_attn_out_proj_weight3, reshape745, model_decoder_layers_3_self_attn_out_proj_bias3, alloc678) R.vm.kill_object(reshape745) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias3) gv1175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc679: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1175, R.dtype("float16")) cls.add(alloc671, alloc678, alloc679) R.vm.kill_object(alloc671) R.vm.kill_object(alloc678) model_decoder_layers_3_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[577] model_decoder_layers_3_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[578] gv1176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc680: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1176, 
R.dtype("float16")) cls.layer_norm(alloc679, model_decoder_layers_3_encoder_attn_layer_norm_weight3, model_decoder_layers_3_encoder_attn_layer_norm_bias3, alloc680) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias3) model_decoder_layers_3_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] model_decoder_layers_3_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[574] gv1177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc681: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1177, R.dtype("float16")) _679: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_encoder_attn_q_proj_weight3, alloc680, model_decoder_layers_3_encoder_attn_q_proj_bias3, alloc681) R.vm.kill_object(alloc680) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias3) gv1178: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape746: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc681, gv1178, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc681) gv1179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape747: R.Tensor((batch_size, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape746, gv1179, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape746) gv1180: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc682: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1180, R.dtype("float16")) _680: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape747, alloc682) R.vm.kill_object(reshape747) gv1181: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape748: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc682, gv1181, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc682) gv1182: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape749: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape748, gv1182, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape748) model_decoder_layers_3_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] model_decoder_layers_3_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[576] gv1183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc683: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1183, R.dtype("float16")) _681: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_encoder_attn_out_proj_weight3, reshape749, model_decoder_layers_3_encoder_attn_out_proj_bias3, alloc683) R.vm.kill_object(reshape749) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias3) gv1184: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc684: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1184, R.dtype("float16")) cls.add(alloc679, alloc683, alloc684) R.vm.kill_object(alloc679) R.vm.kill_object(alloc683) model_decoder_layers_3_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[583] model_decoder_layers_3_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[584] gv1185: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc685: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1185, R.dtype("float16")) cls.layer_norm(alloc684, model_decoder_layers_3_final_layer_norm_weight3, model_decoder_layers_3_final_layer_norm_bias3, alloc685) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias3) model_decoder_layers_3_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] model_decoder_layers_3_fc1_bias3: 
R.Tensor((5120,), dtype="float16") = packed_params[580] gv1186: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc686: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1186, R.dtype("float16")) _684: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_3_fc1_weight3, alloc685, model_decoder_layers_3_fc1_bias3, alloc686) R.vm.kill_object(alloc685) R.vm.kill_object(model_decoder_layers_3_fc1_weight3) R.vm.kill_object(model_decoder_layers_3_fc1_bias3) model_decoder_layers_3_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] model_decoder_layers_3_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[582] gv1187: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc687: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1187, R.dtype("float16")) _685: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_3_fc2_weight3, alloc686, model_decoder_layers_3_fc2_bias3, alloc687) R.vm.kill_object(alloc686) R.vm.kill_object(model_decoder_layers_3_fc2_weight3) R.vm.kill_object(model_decoder_layers_3_fc2_bias3) gv1188: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc688: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1188, R.dtype("float16")) cls.add(alloc684, alloc687, alloc688) R.vm.kill_object(alloc684) 
R.vm.kill_object(alloc687) model_decoder_layers_4_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[592] model_decoder_layers_4_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[593] gv1189: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc689: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1189, R.dtype("float16")) cls.layer_norm(alloc688, model_decoder_layers_4_self_attn_layer_norm_weight3, model_decoder_layers_4_self_attn_layer_norm_bias3, alloc689) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias3) model_decoder_layers_4_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] model_decoder_layers_4_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[589] gv1190: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc690: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1190, R.dtype("float16")) _688: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_self_attn_q_proj_weight3, alloc689, model_decoder_layers_4_self_attn_q_proj_bias3, alloc690) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias3) gv1191: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape750: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc690, gv1191, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc690) model_decoder_layers_4_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] gv1192: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc691: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1192, R.dtype("float16")) _689: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_4_self_attn_k_proj_weight3, alloc689, alloc691) R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight3) gv1193: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape751: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc691, gv1193, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc691) model_decoder_layers_4_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] model_decoder_layers_4_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[587] gv1194: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc692: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1194, R.dtype("float16")) _690: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_self_attn_v_proj_weight3, alloc689, model_decoder_layers_4_self_attn_v_proj_bias3, alloc692) R.vm.kill_object(alloc689) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias3) gv1195: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape752: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc692, gv1195, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc692) gv1196: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc693: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1196, R.dtype("float16")) cls.concatenate(reshape750, reshape751, reshape752, alloc693) R.vm.kill_object(reshape750) R.vm.kill_object(reshape751) R.vm.kill_object(reshape752) gv1197: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape753: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc693, gv1197, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc693) gv1198: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) alloc694: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1198, R.dtype("float16")) _692: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape753, alloc694) R.vm.kill_object(reshape753) gv1199: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape754: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc694, gv1199, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc694) gv1200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape755: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape754, gv1200, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape754) model_decoder_layers_4_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] model_decoder_layers_4_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[591] gv1201: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc695: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1201, R.dtype("float16")) _693: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_self_attn_out_proj_weight3, reshape755, 
model_decoder_layers_4_self_attn_out_proj_bias3, alloc695) R.vm.kill_object(reshape755) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias3) gv1202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc696: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1202, R.dtype("float16")) cls.add(alloc688, alloc695, alloc696) R.vm.kill_object(alloc688) R.vm.kill_object(alloc695) model_decoder_layers_4_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[601] model_decoder_layers_4_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[602] gv1203: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc697: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1203, R.dtype("float16")) cls.layer_norm(alloc696, model_decoder_layers_4_encoder_attn_layer_norm_weight3, model_decoder_layers_4_encoder_attn_layer_norm_bias3, alloc697) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias3) model_decoder_layers_4_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] model_decoder_layers_4_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[598] gv1204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc698: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1204, R.dtype("float16")) _696: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_encoder_attn_q_proj_weight3, alloc697, model_decoder_layers_4_encoder_attn_q_proj_bias3, alloc698) R.vm.kill_object(alloc697) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias3) gv1205: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape756: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc698, gv1205, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc698) gv1206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape757: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape756, gv1206, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape756) gv1207: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc699: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1207, R.dtype("float16")) _697: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape757, alloc699) R.vm.kill_object(reshape757) gv1208: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape758: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc699, gv1208, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc699) gv1209: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape759: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape758, gv1209, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape758) model_decoder_layers_4_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] model_decoder_layers_4_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[600] gv1210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc700: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1210, R.dtype("float16")) _698: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_encoder_attn_out_proj_weight3, reshape759, model_decoder_layers_4_encoder_attn_out_proj_bias3, alloc700) R.vm.kill_object(reshape759) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias3) gv1211: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc701: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1211, R.dtype("float16")) cls.add(alloc696, alloc700, alloc701) R.vm.kill_object(alloc696) R.vm.kill_object(alloc700) model_decoder_layers_4_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[607] model_decoder_layers_4_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[608] gv1212: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc702: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1212, R.dtype("float16")) cls.layer_norm(alloc701, model_decoder_layers_4_final_layer_norm_weight3, model_decoder_layers_4_final_layer_norm_bias3, alloc702) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias3) model_decoder_layers_4_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] model_decoder_layers_4_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[604] gv1213: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc703: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1213, R.dtype("float16")) _701: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_4_fc1_weight3, alloc702, model_decoder_layers_4_fc1_bias3, alloc703) R.vm.kill_object(alloc702) R.vm.kill_object(model_decoder_layers_4_fc1_weight3) R.vm.kill_object(model_decoder_layers_4_fc1_bias3) 
model_decoder_layers_4_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] model_decoder_layers_4_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[606] gv1214: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc704: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1214, R.dtype("float16")) _702: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_4_fc2_weight3, alloc703, model_decoder_layers_4_fc2_bias3, alloc704) R.vm.kill_object(alloc703) R.vm.kill_object(model_decoder_layers_4_fc2_weight3) R.vm.kill_object(model_decoder_layers_4_fc2_bias3) gv1215: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc705: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1215, R.dtype("float16")) cls.add(alloc701, alloc704, alloc705) R.vm.kill_object(alloc701) R.vm.kill_object(alloc704) model_decoder_layers_5_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[616] model_decoder_layers_5_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[617] gv1216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc706: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1216, R.dtype("float16")) cls.layer_norm(alloc705, model_decoder_layers_5_self_attn_layer_norm_weight3, model_decoder_layers_5_self_attn_layer_norm_bias3, alloc706) 
R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias3) model_decoder_layers_5_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] model_decoder_layers_5_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[613] gv1217: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc707: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1217, R.dtype("float16")) _705: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_self_attn_q_proj_weight3, alloc706, model_decoder_layers_5_self_attn_q_proj_bias3, alloc707) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias3) gv1218: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape760: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc707, gv1218, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc707) model_decoder_layers_5_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] gv1219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc708: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1219, R.dtype("float16")) _706: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_5_self_attn_k_proj_weight3, alloc706, alloc708) R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight3) gv1220: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape761: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc708, gv1220, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc708) model_decoder_layers_5_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[610] model_decoder_layers_5_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[611] gv1221: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc709: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1221, R.dtype("float16")) _707: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_self_attn_v_proj_weight3, alloc706, model_decoder_layers_5_self_attn_v_proj_bias3, alloc709) R.vm.kill_object(alloc706) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias3) gv1222: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape762: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc709, gv1222, 
sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc709) gv1223: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc710: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1223, R.dtype("float16")) cls.concatenate(reshape760, reshape761, reshape762, alloc710) R.vm.kill_object(reshape760) R.vm.kill_object(reshape761) R.vm.kill_object(reshape762) gv1224: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape763: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc710, gv1224, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc710) gv1225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc711: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1225, R.dtype("float16")) _709: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape763, alloc711) R.vm.kill_object(reshape763) gv1226: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape764: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc711, gv1226, 
sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc711) gv1227: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape765: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape764, gv1227, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape764) model_decoder_layers_5_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] model_decoder_layers_5_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[615] gv1228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc712: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1228, R.dtype("float16")) _710: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_self_attn_out_proj_weight3, reshape765, model_decoder_layers_5_self_attn_out_proj_bias3, alloc712) R.vm.kill_object(reshape765) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias3) gv1229: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc713: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1229, R.dtype("float16")) cls.add(alloc705, alloc712, alloc713) R.vm.kill_object(alloc705) R.vm.kill_object(alloc712) model_decoder_layers_5_encoder_attn_layer_norm_weight3: R.Tensor((1280,), 
dtype="float16") = packed_params[625] model_decoder_layers_5_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[626] gv1230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc714: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1230, R.dtype("float16")) cls.layer_norm(alloc713, model_decoder_layers_5_encoder_attn_layer_norm_weight3, model_decoder_layers_5_encoder_attn_layer_norm_bias3, alloc714) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias3) model_decoder_layers_5_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] model_decoder_layers_5_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[622] gv1231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc715: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1231, R.dtype("float16")) _713: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_encoder_attn_q_proj_weight3, alloc714, model_decoder_layers_5_encoder_attn_q_proj_bias3, alloc715) R.vm.kill_object(alloc714) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias3) gv1232: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) 
reshape766: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc715, gv1232, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc715) gv1233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape767: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape766, gv1233, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape766) gv1234: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc716: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1234, R.dtype("float16")) _714: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape767, alloc716) R.vm.kill_object(reshape767) gv1235: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape768: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc716, gv1235, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc716) gv1236: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape769: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape768, gv1236, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape768) model_decoder_layers_5_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] model_decoder_layers_5_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[624] gv1237: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc717: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1237, R.dtype("float16")) _715: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_encoder_attn_out_proj_weight3, reshape769, model_decoder_layers_5_encoder_attn_out_proj_bias3, alloc717) R.vm.kill_object(reshape769) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias3) gv1238: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc718: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1238, R.dtype("float16")) cls.add(alloc713, alloc717, alloc718) R.vm.kill_object(alloc713) R.vm.kill_object(alloc717) model_decoder_layers_5_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[631] model_decoder_layers_5_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[632] gv1239: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc719: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1239, R.dtype("float16")) cls.layer_norm(alloc718, model_decoder_layers_5_final_layer_norm_weight3, model_decoder_layers_5_final_layer_norm_bias3, alloc719) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias3) model_decoder_layers_5_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] model_decoder_layers_5_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[628] gv1240: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc720: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1240, R.dtype("float16")) _718: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_5_fc1_weight3, alloc719, model_decoder_layers_5_fc1_bias3, alloc720) R.vm.kill_object(alloc719) R.vm.kill_object(model_decoder_layers_5_fc1_weight3) R.vm.kill_object(model_decoder_layers_5_fc1_bias3) model_decoder_layers_5_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] model_decoder_layers_5_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[630] gv1241: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc721: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1241, R.dtype("float16")) _719: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_5_fc2_weight3, alloc720, model_decoder_layers_5_fc2_bias3, alloc721) R.vm.kill_object(alloc720) R.vm.kill_object(model_decoder_layers_5_fc2_weight3) 
R.vm.kill_object(model_decoder_layers_5_fc2_bias3) gv1242: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc722: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1242, R.dtype("float16")) cls.add(alloc718, alloc721, alloc722) R.vm.kill_object(alloc718) R.vm.kill_object(alloc721) model_decoder_layers_6_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[640] model_decoder_layers_6_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[641] gv1243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc723: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1243, R.dtype("float16")) cls.layer_norm(alloc722, model_decoder_layers_6_self_attn_layer_norm_weight3, model_decoder_layers_6_self_attn_layer_norm_bias3, alloc723) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias3) model_decoder_layers_6_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] model_decoder_layers_6_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[637] gv1244: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc724: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1244, R.dtype("float16")) _722: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", 
model_decoder_layers_6_self_attn_q_proj_weight3, alloc723, model_decoder_layers_6_self_attn_q_proj_bias3, alloc724) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias3) gv1245: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape770: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc724, gv1245, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc724) model_decoder_layers_6_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] gv1246: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc725: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1246, R.dtype("float16")) _723: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_6_self_attn_k_proj_weight3, alloc723, alloc725) R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight3) gv1247: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape771: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc725, gv1247, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc725) model_decoder_layers_6_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] 
model_decoder_layers_6_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[635] gv1248: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc726: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1248, R.dtype("float16")) _724: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_self_attn_v_proj_weight3, alloc723, model_decoder_layers_6_self_attn_v_proj_bias3, alloc726) R.vm.kill_object(alloc723) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias3) gv1249: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape772: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc726, gv1249, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc726) gv1250: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc727: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1250, R.dtype("float16")) cls.concatenate(reshape770, reshape771, reshape772, alloc727) R.vm.kill_object(reshape770) R.vm.kill_object(reshape771) R.vm.kill_object(reshape772) gv1251: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape773: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc727, gv1251, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc727) gv1252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc728: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1252, R.dtype("float16")) _726: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape773, alloc728) R.vm.kill_object(reshape773) gv1253: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape774: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc728, gv1253, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc728) gv1254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape775: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape774, gv1254, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape774) model_decoder_layers_6_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[638] model_decoder_layers_6_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[639] gv1255: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc729: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1255, R.dtype("float16")) _727: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_self_attn_out_proj_weight3, reshape775, model_decoder_layers_6_self_attn_out_proj_bias3, alloc729) R.vm.kill_object(reshape775) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias3) gv1256: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc730: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1256, R.dtype("float16")) cls.add(alloc722, alloc729, alloc730) R.vm.kill_object(alloc722) R.vm.kill_object(alloc729) model_decoder_layers_6_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[649] model_decoder_layers_6_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[650] gv1257: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc731: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1257, R.dtype("float16")) cls.layer_norm(alloc730, model_decoder_layers_6_encoder_attn_layer_norm_weight3, model_decoder_layers_6_encoder_attn_layer_norm_bias3, alloc731) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias3) model_decoder_layers_6_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] model_decoder_layers_6_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[646] gv1258: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc732: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1258, R.dtype("float16")) _730: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_encoder_attn_q_proj_weight3, alloc731, model_decoder_layers_6_encoder_attn_q_proj_bias3, alloc732) R.vm.kill_object(alloc731) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias3) gv1259: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape776: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc732, gv1259, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc732) gv1260: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape777: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape776, gv1260, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape776) gv1261: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc733: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1261, R.dtype("float16")) _731: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape777, alloc733) R.vm.kill_object(reshape777) gv1262: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape778: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc733, gv1262, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc733) gv1263: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape779: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape778, gv1263, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape778) model_decoder_layers_6_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] model_decoder_layers_6_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[648] gv1264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc734: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1264, R.dtype("float16")) _732: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_encoder_attn_out_proj_weight3, reshape779, model_decoder_layers_6_encoder_attn_out_proj_bias3, alloc734) R.vm.kill_object(reshape779) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias3) gv1265: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc735: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1265, R.dtype("float16")) cls.add(alloc730, alloc734, alloc735) R.vm.kill_object(alloc730) R.vm.kill_object(alloc734) model_decoder_layers_6_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[655] model_decoder_layers_6_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[656] gv1266: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc736: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1266, R.dtype("float16")) cls.layer_norm(alloc735, model_decoder_layers_6_final_layer_norm_weight3, model_decoder_layers_6_final_layer_norm_bias3, alloc736) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias3) model_decoder_layers_6_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] model_decoder_layers_6_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[652] gv1267: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc737: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1267, R.dtype("float16")) _735: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_6_fc1_weight3, alloc736, model_decoder_layers_6_fc1_bias3, alloc737) R.vm.kill_object(alloc736) R.vm.kill_object(model_decoder_layers_6_fc1_weight3) R.vm.kill_object(model_decoder_layers_6_fc1_bias3) model_decoder_layers_6_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[653] model_decoder_layers_6_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[654] gv1268: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc738: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1268, R.dtype("float16")) _736: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_6_fc2_weight3, alloc737, model_decoder_layers_6_fc2_bias3, alloc738) R.vm.kill_object(alloc737) R.vm.kill_object(model_decoder_layers_6_fc2_weight3) R.vm.kill_object(model_decoder_layers_6_fc2_bias3) gv1269: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc739: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1269, R.dtype("float16")) cls.add(alloc735, alloc738, alloc739) R.vm.kill_object(alloc735) R.vm.kill_object(alloc738) model_decoder_layers_7_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[664] model_decoder_layers_7_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = 
packed_params[665] gv1270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc740: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1270, R.dtype("float16")) cls.layer_norm(alloc739, model_decoder_layers_7_self_attn_layer_norm_weight3, model_decoder_layers_7_self_attn_layer_norm_bias3, alloc740) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias3) model_decoder_layers_7_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[660] model_decoder_layers_7_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[661] gv1271: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc741: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1271, R.dtype("float16")) _739: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_self_attn_q_proj_weight3, alloc740, model_decoder_layers_7_self_attn_q_proj_bias3, alloc741) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias3) gv1272: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape780: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc741, gv1272, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc741) model_decoder_layers_7_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] gv1273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc742: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1273, R.dtype("float16")) _740: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_7_self_attn_k_proj_weight3, alloc740, alloc742) R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight3) gv1274: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape781: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc742, gv1274, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc742) model_decoder_layers_7_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] model_decoder_layers_7_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[659] gv1275: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc743: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1275, R.dtype("float16")) _741: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_self_attn_v_proj_weight3, alloc740, model_decoder_layers_7_self_attn_v_proj_bias3, alloc743) R.vm.kill_object(alloc740) 
R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias3) gv1276: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape782: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc743, gv1276, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc743) gv1277: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc744: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1277, R.dtype("float16")) cls.concatenate(reshape780, reshape781, reshape782, alloc744) R.vm.kill_object(reshape780) R.vm.kill_object(reshape781) R.vm.kill_object(reshape782) gv1278: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape783: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc744, gv1278, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc744) gv1279: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc745: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1279, R.dtype("float16")) _743: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape783, alloc745) R.vm.kill_object(reshape783) gv1280: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape784: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc745, gv1280, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc745) gv1281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape785: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape784, gv1281, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape784) model_decoder_layers_7_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] model_decoder_layers_7_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[663] gv1282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc746: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1282, R.dtype("float16")) _744: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_self_attn_out_proj_weight3, reshape785, model_decoder_layers_7_self_attn_out_proj_bias3, alloc746) R.vm.kill_object(reshape785) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias3) gv1283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc747: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1283, R.dtype("float16")) cls.add(alloc739, alloc746, alloc747) R.vm.kill_object(alloc739) R.vm.kill_object(alloc746) model_decoder_layers_7_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[673] model_decoder_layers_7_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[674] gv1284: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc748: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1284, R.dtype("float16")) cls.layer_norm(alloc747, model_decoder_layers_7_encoder_attn_layer_norm_weight3, model_decoder_layers_7_encoder_attn_layer_norm_bias3, alloc748) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias3) model_decoder_layers_7_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] model_decoder_layers_7_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[670] gv1285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc749: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1285, R.dtype("float16")) _747: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_encoder_attn_q_proj_weight3, alloc748, model_decoder_layers_7_encoder_attn_q_proj_bias3, alloc749) R.vm.kill_object(alloc748) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias3) gv1286: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape786: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc749, gv1286, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc749) gv1287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape787: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape786, gv1287, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape786) gv1288: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc750: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1288, R.dtype("float16")) _748: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape787, alloc750) R.vm.kill_object(reshape787) gv1289: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape788: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc750, gv1289, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc750) gv1290: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape789: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape788, gv1290, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape788) model_decoder_layers_7_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] model_decoder_layers_7_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[672] gv1291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc751: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1291, R.dtype("float16")) _749: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_encoder_attn_out_proj_weight3, reshape789, model_decoder_layers_7_encoder_attn_out_proj_bias3, alloc751) R.vm.kill_object(reshape789) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias3) gv1292: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc752: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage15, R.prim_value(0), gv1292, R.dtype("float16")) cls.add(alloc747, alloc751, alloc752) R.vm.kill_object(alloc747) R.vm.kill_object(alloc751) model_decoder_layers_7_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[679] model_decoder_layers_7_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[680] gv1293: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc753: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1293, R.dtype("float16")) cls.layer_norm(alloc752, model_decoder_layers_7_final_layer_norm_weight3, model_decoder_layers_7_final_layer_norm_bias3, alloc753) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias3) model_decoder_layers_7_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] model_decoder_layers_7_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[676] gv1294: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc754: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1294, R.dtype("float16")) _752: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_7_fc1_weight3, alloc753, model_decoder_layers_7_fc1_bias3, alloc754) R.vm.kill_object(alloc753) R.vm.kill_object(model_decoder_layers_7_fc1_weight3) R.vm.kill_object(model_decoder_layers_7_fc1_bias3) model_decoder_layers_7_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] model_decoder_layers_7_fc2_bias3: R.Tensor((1280,), 
dtype="float16") = packed_params[678] gv1295: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc755: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1295, R.dtype("float16")) _753: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_7_fc2_weight3, alloc754, model_decoder_layers_7_fc2_bias3, alloc755) R.vm.kill_object(alloc754) R.vm.kill_object(model_decoder_layers_7_fc2_weight3) R.vm.kill_object(model_decoder_layers_7_fc2_bias3) gv1296: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc756: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1296, R.dtype("float16")) cls.add(alloc752, alloc755, alloc756) R.vm.kill_object(alloc752) R.vm.kill_object(alloc755) model_decoder_layers_8_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[688] model_decoder_layers_8_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[689] gv1297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc757: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1297, R.dtype("float16")) cls.layer_norm(alloc756, model_decoder_layers_8_self_attn_layer_norm_weight3, model_decoder_layers_8_self_attn_layer_norm_bias3, alloc757) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias3) 
model_decoder_layers_8_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] model_decoder_layers_8_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[685] gv1298: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc758: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1298, R.dtype("float16")) _756: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_self_attn_q_proj_weight3, alloc757, model_decoder_layers_8_self_attn_q_proj_bias3, alloc758) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias3) gv1299: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape790: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc758, gv1299, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc758) model_decoder_layers_8_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] gv1300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc759: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1300, R.dtype("float16")) _757: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_8_self_attn_k_proj_weight3, alloc757, alloc759) 
R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight3) gv1301: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape791: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc759, gv1301, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc759) model_decoder_layers_8_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[682] model_decoder_layers_8_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[683] gv1302: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc760: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1302, R.dtype("float16")) _758: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_self_attn_v_proj_weight3, alloc757, model_decoder_layers_8_self_attn_v_proj_bias3, alloc760) R.vm.kill_object(alloc757) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias3) gv1303: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape792: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc760, gv1303, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc760) gv1304: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc761: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1304, R.dtype("float16")) cls.concatenate(reshape790, reshape791, reshape792, alloc761) R.vm.kill_object(reshape790) R.vm.kill_object(reshape791) R.vm.kill_object(reshape792) gv1305: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape793: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc761, gv1305, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc761) gv1306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc762: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1306, R.dtype("float16")) _760: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape793, alloc762) R.vm.kill_object(reshape793) gv1307: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape794: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc762, gv1307, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc762) gv1308: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape795: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape794, gv1308, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape794) model_decoder_layers_8_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] model_decoder_layers_8_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[687] gv1309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc763: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1309, R.dtype("float16")) _761: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_self_attn_out_proj_weight3, reshape795, model_decoder_layers_8_self_attn_out_proj_bias3, alloc763) R.vm.kill_object(reshape795) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias3) gv1310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc764: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1310, R.dtype("float16")) cls.add(alloc756, alloc763, alloc764) R.vm.kill_object(alloc756) R.vm.kill_object(alloc763) model_decoder_layers_8_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[697] model_decoder_layers_8_encoder_attn_layer_norm_bias3: R.Tensor((1280,), 
dtype="float16") = packed_params[698] gv1311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc765: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1311, R.dtype("float16")) cls.layer_norm(alloc764, model_decoder_layers_8_encoder_attn_layer_norm_weight3, model_decoder_layers_8_encoder_attn_layer_norm_bias3, alloc765) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias3) model_decoder_layers_8_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] model_decoder_layers_8_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[694] gv1312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc766: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1312, R.dtype("float16")) _764: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_encoder_attn_q_proj_weight3, alloc765, model_decoder_layers_8_encoder_attn_q_proj_bias3, alloc766) R.vm.kill_object(alloc765) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias3) gv1313: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape796: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc766, 
gv1313, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc766) gv1314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape797: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape796, gv1314, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape796) gv1315: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc767: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1315, R.dtype("float16")) _765: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape797, alloc767) R.vm.kill_object(reshape797) gv1316: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape798: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc767, gv1316, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc767) gv1317: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape799: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape798, gv1317, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape798) 
model_decoder_layers_8_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] model_decoder_layers_8_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[696] gv1318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc768: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1318, R.dtype("float16")) _766: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_encoder_attn_out_proj_weight3, reshape799, model_decoder_layers_8_encoder_attn_out_proj_bias3, alloc768) R.vm.kill_object(reshape799) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias3) gv1319: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc769: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1319, R.dtype("float16")) cls.add(alloc764, alloc768, alloc769) R.vm.kill_object(alloc764) R.vm.kill_object(alloc768) model_decoder_layers_8_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[703] model_decoder_layers_8_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[704] gv1320: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc770: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1320, R.dtype("float16")) cls.layer_norm(alloc769, 
model_decoder_layers_8_final_layer_norm_weight3, model_decoder_layers_8_final_layer_norm_bias3, alloc770) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias3) model_decoder_layers_8_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] model_decoder_layers_8_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[700] gv1321: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc771: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1321, R.dtype("float16")) _769: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_8_fc1_weight3, alloc770, model_decoder_layers_8_fc1_bias3, alloc771) R.vm.kill_object(alloc770) R.vm.kill_object(model_decoder_layers_8_fc1_weight3) R.vm.kill_object(model_decoder_layers_8_fc1_bias3) model_decoder_layers_8_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] model_decoder_layers_8_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[702] gv1322: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc772: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1322, R.dtype("float16")) _770: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_8_fc2_weight3, alloc771, model_decoder_layers_8_fc2_bias3, alloc772) R.vm.kill_object(alloc771) R.vm.kill_object(model_decoder_layers_8_fc2_weight3) R.vm.kill_object(model_decoder_layers_8_fc2_bias3) gv1323: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc773: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1323, R.dtype("float16")) cls.add(alloc769, alloc772, alloc773) R.vm.kill_object(alloc769) R.vm.kill_object(alloc772) model_decoder_layers_9_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[712] model_decoder_layers_9_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[713] gv1324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc774: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1324, R.dtype("float16")) cls.layer_norm(alloc773, model_decoder_layers_9_self_attn_layer_norm_weight3, model_decoder_layers_9_self_attn_layer_norm_bias3, alloc774) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias3) model_decoder_layers_9_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[708] model_decoder_layers_9_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[709] gv1325: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc775: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1325, R.dtype("float16")) _773: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_self_attn_q_proj_weight3, alloc774, 
model_decoder_layers_9_self_attn_q_proj_bias3, alloc775) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias3) gv1326: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape800: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc775, gv1326, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc775) model_decoder_layers_9_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] gv1327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc776: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1327, R.dtype("float16")) _774: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_9_self_attn_k_proj_weight3, alloc774, alloc776) R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight3) gv1328: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape801: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc776, gv1328, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc776) model_decoder_layers_9_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] model_decoder_layers_9_self_attn_v_proj_bias3: R.Tensor((1280,), 
dtype="float16") = packed_params[707] gv1329: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc777: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1329, R.dtype("float16")) _775: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_self_attn_v_proj_weight3, alloc774, model_decoder_layers_9_self_attn_v_proj_bias3, alloc777) R.vm.kill_object(alloc774) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias3) gv1330: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape802: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc777, gv1330, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc777) gv1331: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc778: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1331, R.dtype("float16")) cls.concatenate(reshape800, reshape801, reshape802, alloc778) R.vm.kill_object(reshape800) R.vm.kill_object(reshape801) R.vm.kill_object(reshape802) gv1332: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape803: 
R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc778, gv1332, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc778) gv1333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc779: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1333, R.dtype("float16")) _777: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape803, alloc779) R.vm.kill_object(reshape803) gv1334: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape804: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc779, gv1334, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc779) gv1335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape805: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape804, gv1335, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape804) model_decoder_layers_9_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] model_decoder_layers_9_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[711] gv1336: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc780: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1336, R.dtype("float16")) _778: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_self_attn_out_proj_weight3, reshape805, model_decoder_layers_9_self_attn_out_proj_bias3, alloc780) R.vm.kill_object(reshape805) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias3) gv1337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc781: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1337, R.dtype("float16")) cls.add(alloc773, alloc780, alloc781) R.vm.kill_object(alloc773) R.vm.kill_object(alloc780) model_decoder_layers_9_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[721] model_decoder_layers_9_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[722] gv1338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc782: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1338, R.dtype("float16")) cls.layer_norm(alloc781, model_decoder_layers_9_encoder_attn_layer_norm_weight3, model_decoder_layers_9_encoder_attn_layer_norm_bias3, alloc782) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias3) model_decoder_layers_9_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[717] model_decoder_layers_9_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[718] gv1339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc783: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1339, R.dtype("float16")) _781: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_encoder_attn_q_proj_weight3, alloc782, model_decoder_layers_9_encoder_attn_q_proj_bias3, alloc783) R.vm.kill_object(alloc782) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias3) gv1340: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape806: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc783, gv1340, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc783) gv1341: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape807: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape806, gv1341, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape806) gv1342: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) alloc784: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1342, R.dtype("float16")) _782: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape807, alloc784) R.vm.kill_object(reshape807) gv1343: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape808: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc784, gv1343, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc784) gv1344: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape809: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape808, gv1344, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape808) model_decoder_layers_9_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] model_decoder_layers_9_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[720] gv1345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc785: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1345, R.dtype("float16")) _783: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_encoder_attn_out_proj_weight3, reshape809, 
model_decoder_layers_9_encoder_attn_out_proj_bias3, alloc785) R.vm.kill_object(reshape809) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias3) gv1346: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc786: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1346, R.dtype("float16")) cls.add(alloc781, alloc785, alloc786) R.vm.kill_object(alloc781) R.vm.kill_object(alloc785) model_decoder_layers_9_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[727] model_decoder_layers_9_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[728] gv1347: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc787: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1347, R.dtype("float16")) cls.layer_norm(alloc786, model_decoder_layers_9_final_layer_norm_weight3, model_decoder_layers_9_final_layer_norm_bias3, alloc787) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias3) model_decoder_layers_9_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] model_decoder_layers_9_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[724] gv1348: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc788: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, 
R.prim_value(0), gv1348, R.dtype("float16")) _786: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_9_fc1_weight3, alloc787, model_decoder_layers_9_fc1_bias3, alloc788) R.vm.kill_object(alloc787) R.vm.kill_object(model_decoder_layers_9_fc1_weight3) R.vm.kill_object(model_decoder_layers_9_fc1_bias3) model_decoder_layers_9_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] model_decoder_layers_9_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[726] gv1349: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc789: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1349, R.dtype("float16")) _787: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_9_fc2_weight3, alloc788, model_decoder_layers_9_fc2_bias3, alloc789) R.vm.kill_object(alloc788) R.vm.kill_object(model_decoder_layers_9_fc2_weight3) R.vm.kill_object(model_decoder_layers_9_fc2_bias3) gv1350: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc790: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1350, R.dtype("float16")) cls.add(alloc786, alloc789, alloc790) R.vm.kill_object(alloc786) R.vm.kill_object(alloc789) model_decoder_layers_10_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[736] model_decoder_layers_10_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[737] gv1351: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc791: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1351, R.dtype("float16")) cls.layer_norm(alloc790, model_decoder_layers_10_self_attn_layer_norm_weight3, model_decoder_layers_10_self_attn_layer_norm_bias3, alloc791) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias3) model_decoder_layers_10_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] model_decoder_layers_10_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[733] gv1352: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc792: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1352, R.dtype("float16")) _790: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_self_attn_q_proj_weight3, alloc791, model_decoder_layers_10_self_attn_q_proj_bias3, alloc792) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias3) gv1353: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape810: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc792, gv1353, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc792) model_decoder_layers_10_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[729] gv1354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc793: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1354, R.dtype("float16")) _791: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_10_self_attn_k_proj_weight3, alloc791, alloc793) R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight3) gv1355: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape811: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc793, gv1355, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc793) model_decoder_layers_10_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] model_decoder_layers_10_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[731] gv1356: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc794: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1356, R.dtype("float16")) _792: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_self_attn_v_proj_weight3, alloc791, model_decoder_layers_10_self_attn_v_proj_bias3, alloc794) R.vm.kill_object(alloc791) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias3) 
gv1357: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape812: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc794, gv1357, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc794) gv1358: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc795: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1358, R.dtype("float16")) cls.concatenate(reshape810, reshape811, reshape812, alloc795) R.vm.kill_object(reshape810) R.vm.kill_object(reshape811) R.vm.kill_object(reshape812) gv1359: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape813: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc795, gv1359, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc795) gv1360: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc796: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1360, R.dtype("float16")) _794: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape813, alloc796) R.vm.kill_object(reshape813) 
gv1361: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape814: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc796, gv1361, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc796) gv1362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape815: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape814, gv1362, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape814) model_decoder_layers_10_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] model_decoder_layers_10_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[735] gv1363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc797: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1363, R.dtype("float16")) _795: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_self_attn_out_proj_weight3, reshape815, model_decoder_layers_10_self_attn_out_proj_bias3, alloc797) R.vm.kill_object(reshape815) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias3) gv1364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc798: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1364, R.dtype("float16")) cls.add(alloc790, alloc797, alloc798) R.vm.kill_object(alloc790) R.vm.kill_object(alloc797) model_decoder_layers_10_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[745] model_decoder_layers_10_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[746] gv1365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc799: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1365, R.dtype("float16")) cls.layer_norm(alloc798, model_decoder_layers_10_encoder_attn_layer_norm_weight3, model_decoder_layers_10_encoder_attn_layer_norm_bias3, alloc799) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias3) model_decoder_layers_10_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] model_decoder_layers_10_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[742] gv1366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc800: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1366, R.dtype("float16")) _798: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_encoder_attn_q_proj_weight3, alloc799, model_decoder_layers_10_encoder_attn_q_proj_bias3, alloc800) R.vm.kill_object(alloc799) 
R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias3) gv1367: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape816: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc800, gv1367, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc800) gv1368: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape817: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape816, gv1368, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape816) gv1369: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc801: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1369, R.dtype("float16")) _799: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape817, alloc801) R.vm.kill_object(reshape817) gv1370: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape818: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc801, gv1370, 
sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc801) gv1371: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape819: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape818, gv1371, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape818) model_decoder_layers_10_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] model_decoder_layers_10_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[744] gv1372: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc802: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1372, R.dtype("float16")) _800: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_encoder_attn_out_proj_weight3, reshape819, model_decoder_layers_10_encoder_attn_out_proj_bias3, alloc802) R.vm.kill_object(reshape819) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias3) gv1373: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc803: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1373, R.dtype("float16")) cls.add(alloc798, alloc802, alloc803) R.vm.kill_object(alloc798) R.vm.kill_object(alloc802) model_decoder_layers_10_final_layer_norm_weight3: 
R.Tensor((1280,), dtype="float16") = packed_params[751] model_decoder_layers_10_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[752] gv1374: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc804: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1374, R.dtype("float16")) cls.layer_norm(alloc803, model_decoder_layers_10_final_layer_norm_weight3, model_decoder_layers_10_final_layer_norm_bias3, alloc804) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias3) model_decoder_layers_10_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] model_decoder_layers_10_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[748] gv1375: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc805: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1375, R.dtype("float16")) _803: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_10_fc1_weight3, alloc804, model_decoder_layers_10_fc1_bias3, alloc805) R.vm.kill_object(alloc804) R.vm.kill_object(model_decoder_layers_10_fc1_weight3) R.vm.kill_object(model_decoder_layers_10_fc1_bias3) model_decoder_layers_10_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] model_decoder_layers_10_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[750] gv1376: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc806: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1376, R.dtype("float16")) _804: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_10_fc2_weight3, alloc805, model_decoder_layers_10_fc2_bias3, alloc806) R.vm.kill_object(alloc805) R.vm.kill_object(model_decoder_layers_10_fc2_weight3) R.vm.kill_object(model_decoder_layers_10_fc2_bias3) gv1377: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc807: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1377, R.dtype("float16")) cls.add(alloc803, alloc806, alloc807) R.vm.kill_object(alloc803) R.vm.kill_object(alloc806) model_decoder_layers_11_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[760] model_decoder_layers_11_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[761] gv1378: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc808: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1378, R.dtype("float16")) cls.layer_norm(alloc807, model_decoder_layers_11_self_attn_layer_norm_weight3, model_decoder_layers_11_self_attn_layer_norm_bias3, alloc808) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias3) model_decoder_layers_11_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] model_decoder_layers_11_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = 
packed_params[757] gv1379: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc809: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1379, R.dtype("float16")) _807: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_self_attn_q_proj_weight3, alloc808, model_decoder_layers_11_self_attn_q_proj_bias3, alloc809) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias3) gv1380: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape820: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc809, gv1380, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc809) model_decoder_layers_11_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] gv1381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc810: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1381, R.dtype("float16")) _808: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_11_self_attn_k_proj_weight3, alloc808, alloc810) R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight3) gv1382: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape821: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc810, gv1382, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc810) model_decoder_layers_11_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] model_decoder_layers_11_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[755] gv1383: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc811: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1383, R.dtype("float16")) _809: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_self_attn_v_proj_weight3, alloc808, model_decoder_layers_11_self_attn_v_proj_bias3, alloc811) R.vm.kill_object(alloc808) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias3) gv1384: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape822: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc811, gv1384, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc811) gv1385: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) alloc812: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1385, R.dtype("float16")) cls.concatenate(reshape820, reshape821, reshape822, alloc812) R.vm.kill_object(reshape820) R.vm.kill_object(reshape821) R.vm.kill_object(reshape822) gv1386: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape823: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc812, gv1386, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc812) gv1387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc813: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1387, R.dtype("float16")) _811: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape823, alloc813) R.vm.kill_object(reshape823) gv1388: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape824: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc813, gv1388, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc813) gv1389: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
reshape825: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape824, gv1389, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape824) model_decoder_layers_11_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] model_decoder_layers_11_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[759] gv1390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc814: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1390, R.dtype("float16")) _812: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_self_attn_out_proj_weight3, reshape825, model_decoder_layers_11_self_attn_out_proj_bias3, alloc814) R.vm.kill_object(reshape825) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias3) gv1391: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc815: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1391, R.dtype("float16")) cls.add(alloc807, alloc814, alloc815) R.vm.kill_object(alloc807) R.vm.kill_object(alloc814) model_decoder_layers_11_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[769] model_decoder_layers_11_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[770] gv1392: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc816: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1392, R.dtype("float16")) cls.layer_norm(alloc815, model_decoder_layers_11_encoder_attn_layer_norm_weight3, model_decoder_layers_11_encoder_attn_layer_norm_bias3, alloc816) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias3) model_decoder_layers_11_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] model_decoder_layers_11_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[766] gv1393: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc817: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1393, R.dtype("float16")) _815: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_encoder_attn_q_proj_weight3, alloc816, model_decoder_layers_11_encoder_attn_q_proj_bias3, alloc817) R.vm.kill_object(alloc816) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias3) gv1394: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape826: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc817, gv1394, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc817) gv1395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape827: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape826, gv1395, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape826) gv1396: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc818: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1396, R.dtype("float16")) _816: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape827, alloc818) R.vm.kill_object(reshape827) gv1397: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape828: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc818, gv1397, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc818) gv1398: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape829: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape828, gv1398, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape828) model_decoder_layers_11_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] model_decoder_layers_11_encoder_attn_out_proj_bias3: R.Tensor((1280,), 
dtype="float16") = packed_params[768] gv1399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc819: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1399, R.dtype("float16")) _817: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_encoder_attn_out_proj_weight3, reshape829, model_decoder_layers_11_encoder_attn_out_proj_bias3, alloc819) R.vm.kill_object(reshape829) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias3) gv1400: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc820: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1400, R.dtype("float16")) cls.add(alloc815, alloc819, alloc820) R.vm.kill_object(alloc815) R.vm.kill_object(alloc819) model_decoder_layers_11_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[775] model_decoder_layers_11_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[776] gv1401: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc821: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1401, R.dtype("float16")) cls.layer_norm(alloc820, model_decoder_layers_11_final_layer_norm_weight3, model_decoder_layers_11_final_layer_norm_bias3, alloc821) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias3) model_decoder_layers_11_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] model_decoder_layers_11_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[772] gv1402: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc822: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1402, R.dtype("float16")) _820: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_11_fc1_weight3, alloc821, model_decoder_layers_11_fc1_bias3, alloc822) R.vm.kill_object(alloc821) R.vm.kill_object(model_decoder_layers_11_fc1_weight3) R.vm.kill_object(model_decoder_layers_11_fc1_bias3) model_decoder_layers_11_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] model_decoder_layers_11_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[774] gv1403: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc823: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1403, R.dtype("float16")) _821: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_11_fc2_weight3, alloc822, model_decoder_layers_11_fc2_bias3, alloc823) R.vm.kill_object(alloc822) R.vm.kill_object(model_decoder_layers_11_fc2_weight3) R.vm.kill_object(model_decoder_layers_11_fc2_bias3) gv1404: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc824: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1404, R.dtype("float16")) cls.add(alloc820, alloc823, alloc824) R.vm.kill_object(alloc820) R.vm.kill_object(alloc823) model_decoder_layers_12_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[784] model_decoder_layers_12_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[785] gv1405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc825: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1405, R.dtype("float16")) cls.layer_norm(alloc824, model_decoder_layers_12_self_attn_layer_norm_weight3, model_decoder_layers_12_self_attn_layer_norm_bias3, alloc825) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias3) model_decoder_layers_12_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] model_decoder_layers_12_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[781] gv1406: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc826: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1406, R.dtype("float16")) _824: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_self_attn_q_proj_weight3, alloc825, model_decoder_layers_12_self_attn_q_proj_bias3, alloc826) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias3) 
gv1407: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape830: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc826, gv1407, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc826) model_decoder_layers_12_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] gv1408: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc827: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1408, R.dtype("float16")) _825: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_12_self_attn_k_proj_weight3, alloc825, alloc827) R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight3) gv1409: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape831: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc827, gv1409, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc827) model_decoder_layers_12_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[778] model_decoder_layers_12_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[779] gv1410: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc828: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1410, R.dtype("float16")) _826: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_self_attn_v_proj_weight3, alloc825, model_decoder_layers_12_self_attn_v_proj_bias3, alloc828) R.vm.kill_object(alloc825) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias3) gv1411: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape832: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc828, gv1411, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc828) gv1412: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc829: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1412, R.dtype("float16")) cls.concatenate(reshape830, reshape831, reshape832, alloc829) R.vm.kill_object(reshape830) R.vm.kill_object(reshape831) R.vm.kill_object(reshape832) gv1413: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape833: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc829, gv1413, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) 
R.vm.kill_object(alloc829) gv1414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc830: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1414, R.dtype("float16")) _828: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape833, alloc830) R.vm.kill_object(reshape833) gv1415: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape834: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc830, gv1415, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc830) gv1416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape835: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape834, gv1416, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape834) model_decoder_layers_12_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] model_decoder_layers_12_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[783] gv1417: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc831: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage14, R.prim_value(0), gv1417, R.dtype("float16")) _829: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_self_attn_out_proj_weight3, reshape835, model_decoder_layers_12_self_attn_out_proj_bias3, alloc831) R.vm.kill_object(reshape835) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias3) gv1418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc832: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1418, R.dtype("float16")) cls.add(alloc824, alloc831, alloc832) R.vm.kill_object(alloc824) R.vm.kill_object(alloc831) model_decoder_layers_12_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[793] model_decoder_layers_12_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[794] gv1419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc833: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1419, R.dtype("float16")) cls.layer_norm(alloc832, model_decoder_layers_12_encoder_attn_layer_norm_weight3, model_decoder_layers_12_encoder_attn_layer_norm_bias3, alloc833) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias3) model_decoder_layers_12_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] model_decoder_layers_12_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[790] gv1420: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc834: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1420, R.dtype("float16")) _832: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_encoder_attn_q_proj_weight3, alloc833, model_decoder_layers_12_encoder_attn_q_proj_bias3, alloc834) R.vm.kill_object(alloc833) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias3) gv1421: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape836: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc834, gv1421, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc834) gv1422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape837: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape836, gv1422, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape836) gv1423: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc835: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1423, R.dtype("float16")) _833: R.Object 
= R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape837, alloc835) R.vm.kill_object(reshape837) gv1424: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape838: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc835, gv1424, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc835) gv1425: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape839: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape838, gv1425, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape838) model_decoder_layers_12_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] model_decoder_layers_12_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[792] gv1426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc836: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1426, R.dtype("float16")) _834: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_encoder_attn_out_proj_weight3, reshape839, model_decoder_layers_12_encoder_attn_out_proj_bias3, alloc836) R.vm.kill_object(reshape839) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias3) gv1427: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc837: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1427, R.dtype("float16")) cls.add(alloc832, alloc836, alloc837) R.vm.kill_object(alloc832) R.vm.kill_object(alloc836) model_decoder_layers_12_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[799] model_decoder_layers_12_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[800] gv1428: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc838: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1428, R.dtype("float16")) cls.layer_norm(alloc837, model_decoder_layers_12_final_layer_norm_weight3, model_decoder_layers_12_final_layer_norm_bias3, alloc838) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias3) model_decoder_layers_12_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] model_decoder_layers_12_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[796] gv1429: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc839: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1429, R.dtype("float16")) _837: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", 
model_decoder_layers_12_fc1_weight3, alloc838, model_decoder_layers_12_fc1_bias3, alloc839) R.vm.kill_object(alloc838) R.vm.kill_object(model_decoder_layers_12_fc1_weight3) R.vm.kill_object(model_decoder_layers_12_fc1_bias3) model_decoder_layers_12_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] model_decoder_layers_12_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[798] gv1430: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc840: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1430, R.dtype("float16")) _838: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_12_fc2_weight3, alloc839, model_decoder_layers_12_fc2_bias3, alloc840) R.vm.kill_object(alloc839) R.vm.kill_object(model_decoder_layers_12_fc2_weight3) R.vm.kill_object(model_decoder_layers_12_fc2_bias3) gv1431: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc841: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1431, R.dtype("float16")) cls.add(alloc837, alloc840, alloc841) R.vm.kill_object(alloc837) R.vm.kill_object(alloc840) model_decoder_layers_13_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[808] model_decoder_layers_13_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[809] gv1432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc842: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1432, R.dtype("float16")) cls.layer_norm(alloc841, model_decoder_layers_13_self_attn_layer_norm_weight3, model_decoder_layers_13_self_attn_layer_norm_bias3, alloc842) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias3) model_decoder_layers_13_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] model_decoder_layers_13_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[805] gv1433: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc843: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1433, R.dtype("float16")) _841: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_self_attn_q_proj_weight3, alloc842, model_decoder_layers_13_self_attn_q_proj_bias3, alloc843) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias3) gv1434: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape840: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc843, gv1434, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc843) model_decoder_layers_13_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] gv1435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc844: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1435, R.dtype("float16")) _842: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_13_self_attn_k_proj_weight3, alloc842, alloc844) R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight3) gv1436: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape841: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc844, gv1436, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc844) model_decoder_layers_13_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] model_decoder_layers_13_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[803] gv1437: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc845: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1437, R.dtype("float16")) _843: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_self_attn_v_proj_weight3, alloc842, model_decoder_layers_13_self_attn_v_proj_bias3, alloc845) R.vm.kill_object(alloc842) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias3) gv1438: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape842: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc845, gv1438, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc845) gv1439: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc846: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1439, R.dtype("float16")) cls.concatenate(reshape840, reshape841, reshape842, alloc846) R.vm.kill_object(reshape840) R.vm.kill_object(reshape841) R.vm.kill_object(reshape842) gv1440: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape843: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc846, gv1440, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc846) gv1441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc847: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1441, R.dtype("float16")) _845: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape843, alloc847) R.vm.kill_object(reshape843) gv1442: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape844: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc847, gv1442, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc847) gv1443: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape845: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape844, gv1443, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape844) model_decoder_layers_13_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] model_decoder_layers_13_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[807] gv1444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc848: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1444, R.dtype("float16")) _846: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_self_attn_out_proj_weight3, reshape845, model_decoder_layers_13_self_attn_out_proj_bias3, alloc848) R.vm.kill_object(reshape845) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias3) gv1445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc849: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1445, R.dtype("float16")) cls.add(alloc841, alloc848, alloc849) R.vm.kill_object(alloc841) R.vm.kill_object(alloc848) model_decoder_layers_13_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[817] model_decoder_layers_13_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[818] gv1446: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc850: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1446, R.dtype("float16")) cls.layer_norm(alloc849, model_decoder_layers_13_encoder_attn_layer_norm_weight3, model_decoder_layers_13_encoder_attn_layer_norm_bias3, alloc850) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias3) model_decoder_layers_13_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] model_decoder_layers_13_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[814] gv1447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc851: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1447, R.dtype("float16")) _849: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_encoder_attn_q_proj_weight3, alloc850, model_decoder_layers_13_encoder_attn_q_proj_bias3, alloc851) R.vm.kill_object(alloc850) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight3) 
R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias3) gv1448: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape846: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc851, gv1448, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc851) gv1449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape847: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape846, gv1449, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape846) gv1450: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc852: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1450, R.dtype("float16")) _850: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape847, alloc852) R.vm.kill_object(reshape847) gv1451: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape848: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc852, gv1451, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc852) 
gv1452: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape849: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape848, gv1452, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape848) model_decoder_layers_13_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] model_decoder_layers_13_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[816] gv1453: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc853: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1453, R.dtype("float16")) _851: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_encoder_attn_out_proj_weight3, reshape849, model_decoder_layers_13_encoder_attn_out_proj_bias3, alloc853) R.vm.kill_object(reshape849) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias3) gv1454: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc854: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1454, R.dtype("float16")) cls.add(alloc849, alloc853, alloc854) R.vm.kill_object(alloc849) R.vm.kill_object(alloc853) model_decoder_layers_13_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[823] 
model_decoder_layers_13_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[824] gv1455: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc855: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1455, R.dtype("float16")) cls.layer_norm(alloc854, model_decoder_layers_13_final_layer_norm_weight3, model_decoder_layers_13_final_layer_norm_bias3, alloc855) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias3) model_decoder_layers_13_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] model_decoder_layers_13_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[820] gv1456: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc856: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1456, R.dtype("float16")) _854: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_13_fc1_weight3, alloc855, model_decoder_layers_13_fc1_bias3, alloc856) R.vm.kill_object(alloc855) R.vm.kill_object(model_decoder_layers_13_fc1_weight3) R.vm.kill_object(model_decoder_layers_13_fc1_bias3) model_decoder_layers_13_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] model_decoder_layers_13_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[822] gv1457: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc857: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1457, R.dtype("float16")) _855: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_13_fc2_weight3, alloc856, model_decoder_layers_13_fc2_bias3, alloc857) R.vm.kill_object(alloc856) R.vm.kill_object(model_decoder_layers_13_fc2_weight3) R.vm.kill_object(model_decoder_layers_13_fc2_bias3) gv1458: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc858: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1458, R.dtype("float16")) cls.add(alloc854, alloc857, alloc858) R.vm.kill_object(alloc854) R.vm.kill_object(alloc857) model_decoder_layers_14_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[832] model_decoder_layers_14_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[833] gv1459: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc859: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1459, R.dtype("float16")) cls.layer_norm(alloc858, model_decoder_layers_14_self_attn_layer_norm_weight3, model_decoder_layers_14_self_attn_layer_norm_bias3, alloc859) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias3) model_decoder_layers_14_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] model_decoder_layers_14_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[829] gv1460: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc860: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1460, R.dtype("float16")) _858: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_self_attn_q_proj_weight3, alloc859, model_decoder_layers_14_self_attn_q_proj_bias3, alloc860) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias3) gv1461: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape850: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc860, gv1461, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc860) model_decoder_layers_14_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] gv1462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc861: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1462, R.dtype("float16")) _859: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_14_self_attn_k_proj_weight3, alloc859, alloc861) R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight3) gv1463: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape851: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc861, gv1463, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc861) model_decoder_layers_14_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] model_decoder_layers_14_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[827] gv1464: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc862: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1464, R.dtype("float16")) _860: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_self_attn_v_proj_weight3, alloc859, model_decoder_layers_14_self_attn_v_proj_bias3, alloc862) R.vm.kill_object(alloc859) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias3) gv1465: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape852: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc862, gv1465, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc862) gv1466: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc863: 
R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1466, R.dtype("float16")) cls.concatenate(reshape850, reshape851, reshape852, alloc863) R.vm.kill_object(reshape850) R.vm.kill_object(reshape851) R.vm.kill_object(reshape852) gv1467: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape853: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc863, gv1467, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc863) gv1468: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc864: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1468, R.dtype("float16")) _862: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape853, alloc864) R.vm.kill_object(reshape853) gv1469: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape854: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc864, gv1469, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc864) gv1470: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape855: R.Tensor((batch_size, 1, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape854, gv1470, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape854) model_decoder_layers_14_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] model_decoder_layers_14_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[831] gv1471: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc865: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1471, R.dtype("float16")) _863: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_self_attn_out_proj_weight3, reshape855, model_decoder_layers_14_self_attn_out_proj_bias3, alloc865) R.vm.kill_object(reshape855) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias3) gv1472: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc866: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1472, R.dtype("float16")) cls.add(alloc858, alloc865, alloc866) R.vm.kill_object(alloc858) R.vm.kill_object(alloc865) model_decoder_layers_14_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[841] model_decoder_layers_14_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[842] gv1473: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc867: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1473, R.dtype("float16")) cls.layer_norm(alloc866, model_decoder_layers_14_encoder_attn_layer_norm_weight3, model_decoder_layers_14_encoder_attn_layer_norm_bias3, alloc867) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias3) model_decoder_layers_14_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] model_decoder_layers_14_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[838] gv1474: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc868: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1474, R.dtype("float16")) _866: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_encoder_attn_q_proj_weight3, alloc867, model_decoder_layers_14_encoder_attn_q_proj_bias3, alloc868) R.vm.kill_object(alloc867) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias3) gv1475: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape856: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc868, gv1475, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc868) gv1476: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape857: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape856, gv1476, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape856) gv1477: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc869: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1477, R.dtype("float16")) _867: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape857, alloc869) R.vm.kill_object(reshape857) gv1478: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape858: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc869, gv1478, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc869) gv1479: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape859: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape858, gv1479, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape858) model_decoder_layers_14_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] model_decoder_layers_14_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[840] 
gv1480: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc870: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1480, R.dtype("float16")) _868: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_encoder_attn_out_proj_weight3, reshape859, model_decoder_layers_14_encoder_attn_out_proj_bias3, alloc870) R.vm.kill_object(reshape859) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias3) gv1481: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc871: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1481, R.dtype("float16")) cls.add(alloc866, alloc870, alloc871) R.vm.kill_object(alloc866) R.vm.kill_object(alloc870) model_decoder_layers_14_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[847] model_decoder_layers_14_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[848] gv1482: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc872: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1482, R.dtype("float16")) cls.layer_norm(alloc871, model_decoder_layers_14_final_layer_norm_weight3, model_decoder_layers_14_final_layer_norm_bias3, alloc872) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias3) model_decoder_layers_14_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[843] model_decoder_layers_14_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[844] gv1483: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc873: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1483, R.dtype("float16")) _871: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_14_fc1_weight3, alloc872, model_decoder_layers_14_fc1_bias3, alloc873) R.vm.kill_object(alloc872) R.vm.kill_object(model_decoder_layers_14_fc1_weight3) R.vm.kill_object(model_decoder_layers_14_fc1_bias3) model_decoder_layers_14_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] model_decoder_layers_14_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[846] gv1484: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc874: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1484, R.dtype("float16")) _872: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_14_fc2_weight3, alloc873, model_decoder_layers_14_fc2_bias3, alloc874) R.vm.kill_object(alloc873) R.vm.kill_object(model_decoder_layers_14_fc2_weight3) R.vm.kill_object(model_decoder_layers_14_fc2_bias3) gv1485: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc875: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1485, R.dtype("float16")) cls.add(alloc871, alloc874, alloc875) R.vm.kill_object(alloc871) R.vm.kill_object(alloc874) model_decoder_layers_15_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[856] model_decoder_layers_15_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[857] gv1486: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc876: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1486, R.dtype("float16")) cls.layer_norm(alloc875, model_decoder_layers_15_self_attn_layer_norm_weight3, model_decoder_layers_15_self_attn_layer_norm_bias3, alloc876) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias3) model_decoder_layers_15_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] model_decoder_layers_15_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[853] gv1487: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc877: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1487, R.dtype("float16")) _875: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_self_attn_q_proj_weight3, alloc876, model_decoder_layers_15_self_attn_q_proj_bias3, alloc877) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias3) 
gv1488: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape860: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc877, gv1488, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc877) model_decoder_layers_15_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] gv1489: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc878: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1489, R.dtype("float16")) _876: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_15_self_attn_k_proj_weight3, alloc876, alloc878) R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight3) gv1490: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape861: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc878, gv1490, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc878) model_decoder_layers_15_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] model_decoder_layers_15_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[851] gv1491: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc879: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1491, R.dtype("float16")) _877: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_self_attn_v_proj_weight3, alloc876, model_decoder_layers_15_self_attn_v_proj_bias3, alloc879) R.vm.kill_object(alloc876) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias3) gv1492: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape862: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc879, gv1492, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc879) gv1493: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc880: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1493, R.dtype("float16")) cls.concatenate(reshape860, reshape861, reshape862, alloc880) R.vm.kill_object(reshape860) R.vm.kill_object(reshape861) R.vm.kill_object(reshape862) gv1494: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape863: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc880, gv1494, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) 
R.vm.kill_object(alloc880) gv1495: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc881: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1495, R.dtype("float16")) _879: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape863, alloc881) R.vm.kill_object(reshape863) gv1496: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape864: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc881, gv1496, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc881) gv1497: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape865: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape864, gv1497, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape864) model_decoder_layers_15_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] model_decoder_layers_15_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[855] gv1498: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc882: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1498, R.dtype("float16")) _880: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_self_attn_out_proj_weight3, reshape865, model_decoder_layers_15_self_attn_out_proj_bias3, alloc882) R.vm.kill_object(reshape865) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias3) gv1499: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc883: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1499, R.dtype("float16")) cls.add(alloc875, alloc882, alloc883) R.vm.kill_object(alloc875) R.vm.kill_object(alloc882) model_decoder_layers_15_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[865] model_decoder_layers_15_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[866] gv1500: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc884: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1500, R.dtype("float16")) cls.layer_norm(alloc883, model_decoder_layers_15_encoder_attn_layer_norm_weight3, model_decoder_layers_15_encoder_attn_layer_norm_bias3, alloc884) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias3) model_decoder_layers_15_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] model_decoder_layers_15_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[862] gv1501: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc885: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1501, R.dtype("float16")) _883: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_encoder_attn_q_proj_weight3, alloc884, model_decoder_layers_15_encoder_attn_q_proj_bias3, alloc885) R.vm.kill_object(alloc884) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias3) gv1502: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape866: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc885, gv1502, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc885) gv1503: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape867: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape866, gv1503, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape866) gv1504: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc886: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1504, R.dtype("float16")) _884: R.Object 
= R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape867, alloc886) R.vm.kill_object(reshape867) gv1505: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape868: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc886, gv1505, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc886) gv1506: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape869: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape868, gv1506, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape868) model_decoder_layers_15_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] model_decoder_layers_15_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[864] gv1507: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc887: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1507, R.dtype("float16")) _885: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_encoder_attn_out_proj_weight3, reshape869, model_decoder_layers_15_encoder_attn_out_proj_bias3, alloc887) R.vm.kill_object(reshape869) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias3) gv1508: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc888: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1508, R.dtype("float16")) cls.add(alloc883, alloc887, alloc888) R.vm.kill_object(alloc883) R.vm.kill_object(alloc887) model_decoder_layers_15_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[871] model_decoder_layers_15_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[872] gv1509: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc889: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1509, R.dtype("float16")) cls.layer_norm(alloc888, model_decoder_layers_15_final_layer_norm_weight3, model_decoder_layers_15_final_layer_norm_bias3, alloc889) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias3) model_decoder_layers_15_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] model_decoder_layers_15_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[868] gv1510: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc890: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1510, R.dtype("float16")) _888: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", 
model_decoder_layers_15_fc1_weight3, alloc889, model_decoder_layers_15_fc1_bias3, alloc890) R.vm.kill_object(alloc889) R.vm.kill_object(model_decoder_layers_15_fc1_weight3) R.vm.kill_object(model_decoder_layers_15_fc1_bias3) model_decoder_layers_15_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] model_decoder_layers_15_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[870] gv1511: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc891: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1511, R.dtype("float16")) _889: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_15_fc2_weight3, alloc890, model_decoder_layers_15_fc2_bias3, alloc891) R.vm.kill_object(alloc890) R.vm.kill_object(model_decoder_layers_15_fc2_weight3) R.vm.kill_object(model_decoder_layers_15_fc2_bias3) gv1512: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc892: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1512, R.dtype("float16")) cls.add(alloc888, alloc891, alloc892) R.vm.kill_object(alloc888) R.vm.kill_object(alloc891) model_decoder_layers_16_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[880] model_decoder_layers_16_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[881] gv1513: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc893: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1513, R.dtype("float16")) cls.layer_norm(alloc892, model_decoder_layers_16_self_attn_layer_norm_weight3, model_decoder_layers_16_self_attn_layer_norm_bias3, alloc893) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias3) model_decoder_layers_16_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] model_decoder_layers_16_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[877] gv1514: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc894: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1514, R.dtype("float16")) _892: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_self_attn_q_proj_weight3, alloc893, model_decoder_layers_16_self_attn_q_proj_bias3, alloc894) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias3) gv1515: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape870: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc894, gv1515, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc894) model_decoder_layers_16_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] gv1516: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc895: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1516, R.dtype("float16")) _893: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_16_self_attn_k_proj_weight3, alloc893, alloc895) R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight3) gv1517: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape871: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc895, gv1517, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc895) model_decoder_layers_16_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] model_decoder_layers_16_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[875] gv1518: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc896: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1518, R.dtype("float16")) _894: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_self_attn_v_proj_weight3, alloc893, model_decoder_layers_16_self_attn_v_proj_bias3, alloc896) R.vm.kill_object(alloc893) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias3) gv1519: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape872: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc896, gv1519, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc896) gv1520: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc897: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1520, R.dtype("float16")) cls.concatenate(reshape870, reshape871, reshape872, alloc897) R.vm.kill_object(reshape870) R.vm.kill_object(reshape871) R.vm.kill_object(reshape872) gv1521: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape873: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc897, gv1521, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc897) gv1522: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc898: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1522, R.dtype("float16")) _896: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape873, alloc898) R.vm.kill_object(reshape873) gv1523: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape874: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc898, gv1523, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc898) gv1524: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape875: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape874, gv1524, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape874) model_decoder_layers_16_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] model_decoder_layers_16_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[879] gv1525: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc899: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1525, R.dtype("float16")) _897: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_self_attn_out_proj_weight3, reshape875, model_decoder_layers_16_self_attn_out_proj_bias3, alloc899) R.vm.kill_object(reshape875) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias3) gv1526: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc900: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1526, R.dtype("float16")) cls.add(alloc892, alloc899, alloc900) R.vm.kill_object(alloc892) R.vm.kill_object(alloc899) model_decoder_layers_16_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[889] model_decoder_layers_16_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[890] gv1527: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc901: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1527, R.dtype("float16")) cls.layer_norm(alloc900, model_decoder_layers_16_encoder_attn_layer_norm_weight3, model_decoder_layers_16_encoder_attn_layer_norm_bias3, alloc901) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias3) model_decoder_layers_16_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] model_decoder_layers_16_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[886] gv1528: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc902: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1528, R.dtype("float16")) _900: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_encoder_attn_q_proj_weight3, alloc901, model_decoder_layers_16_encoder_attn_q_proj_bias3, alloc902) R.vm.kill_object(alloc901) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight3) 
R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias3) gv1529: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape876: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc902, gv1529, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc902) gv1530: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape877: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape876, gv1530, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape876) gv1531: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc903: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1531, R.dtype("float16")) _901: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape877, alloc903) R.vm.kill_object(reshape877) gv1532: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape878: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc903, gv1532, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc903) 
gv1533: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape879: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape878, gv1533, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape878) model_decoder_layers_16_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] model_decoder_layers_16_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[888] gv1534: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc904: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1534, R.dtype("float16")) _902: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_encoder_attn_out_proj_weight3, reshape879, model_decoder_layers_16_encoder_attn_out_proj_bias3, alloc904) R.vm.kill_object(reshape879) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias3) gv1535: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc905: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1535, R.dtype("float16")) cls.add(alloc900, alloc904, alloc905) R.vm.kill_object(alloc900) R.vm.kill_object(alloc904) model_decoder_layers_16_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[895] 
model_decoder_layers_16_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[896] gv1536: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc906: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1536, R.dtype("float16")) cls.layer_norm(alloc905, model_decoder_layers_16_final_layer_norm_weight3, model_decoder_layers_16_final_layer_norm_bias3, alloc906) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias3) model_decoder_layers_16_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] model_decoder_layers_16_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[892] gv1537: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc907: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1537, R.dtype("float16")) _905: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_16_fc1_weight3, alloc906, model_decoder_layers_16_fc1_bias3, alloc907) R.vm.kill_object(alloc906) R.vm.kill_object(model_decoder_layers_16_fc1_weight3) R.vm.kill_object(model_decoder_layers_16_fc1_bias3) model_decoder_layers_16_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] model_decoder_layers_16_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[894] gv1538: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc908: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1538, R.dtype("float16")) _906: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_16_fc2_weight3, alloc907, model_decoder_layers_16_fc2_bias3, alloc908) R.vm.kill_object(alloc907) R.vm.kill_object(model_decoder_layers_16_fc2_weight3) R.vm.kill_object(model_decoder_layers_16_fc2_bias3) gv1539: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc909: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1539, R.dtype("float16")) cls.add(alloc905, alloc908, alloc909) R.vm.kill_object(alloc905) R.vm.kill_object(alloc908) model_decoder_layers_17_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[904] model_decoder_layers_17_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[905] gv1540: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc910: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1540, R.dtype("float16")) cls.layer_norm(alloc909, model_decoder_layers_17_self_attn_layer_norm_weight3, model_decoder_layers_17_self_attn_layer_norm_bias3, alloc910) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias3) model_decoder_layers_17_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] model_decoder_layers_17_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[901] gv1541: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc911: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1541, R.dtype("float16")) _909: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_self_attn_q_proj_weight3, alloc910, model_decoder_layers_17_self_attn_q_proj_bias3, alloc911) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias3) gv1542: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape880: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc911, gv1542, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc911) model_decoder_layers_17_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] gv1543: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc912: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1543, R.dtype("float16")) _910: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_17_self_attn_k_proj_weight3, alloc910, alloc912) R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight3) gv1544: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape881: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc912, gv1544, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc912) model_decoder_layers_17_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[898] model_decoder_layers_17_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[899] gv1545: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc913: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1545, R.dtype("float16")) _911: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_self_attn_v_proj_weight3, alloc910, model_decoder_layers_17_self_attn_v_proj_bias3, alloc913) R.vm.kill_object(alloc910) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias3) gv1546: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape882: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc913, gv1546, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc913) gv1547: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc914: 
R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1547, R.dtype("float16")) cls.concatenate(reshape880, reshape881, reshape882, alloc914) R.vm.kill_object(reshape880) R.vm.kill_object(reshape881) R.vm.kill_object(reshape882) gv1548: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape883: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc914, gv1548, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc914) gv1549: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc915: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1549, R.dtype("float16")) _913: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape883, alloc915) R.vm.kill_object(reshape883) gv1550: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape884: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc915, gv1550, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc915) gv1551: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape885: R.Tensor((batch_size, 1, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape884, gv1551, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape884) model_decoder_layers_17_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] model_decoder_layers_17_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[903] gv1552: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc916: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1552, R.dtype("float16")) _914: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_self_attn_out_proj_weight3, reshape885, model_decoder_layers_17_self_attn_out_proj_bias3, alloc916) R.vm.kill_object(reshape885) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias3) gv1553: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc917: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1553, R.dtype("float16")) cls.add(alloc909, alloc916, alloc917) R.vm.kill_object(alloc909) R.vm.kill_object(alloc916) model_decoder_layers_17_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[913] model_decoder_layers_17_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[914] gv1554: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc918: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1554, R.dtype("float16")) cls.layer_norm(alloc917, model_decoder_layers_17_encoder_attn_layer_norm_weight3, model_decoder_layers_17_encoder_attn_layer_norm_bias3, alloc918) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias3) model_decoder_layers_17_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] model_decoder_layers_17_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[910] gv1555: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc919: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1555, R.dtype("float16")) _917: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_encoder_attn_q_proj_weight3, alloc918, model_decoder_layers_17_encoder_attn_q_proj_bias3, alloc919) R.vm.kill_object(alloc918) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias3) gv1556: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape886: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc919, gv1556, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc919) gv1557: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape887: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape886, gv1557, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape886) gv1558: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc920: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1558, R.dtype("float16")) _918: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape887, alloc920) R.vm.kill_object(reshape887) gv1559: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape888: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc920, gv1559, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc920) gv1560: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape889: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape888, gv1560, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape888) model_decoder_layers_17_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] model_decoder_layers_17_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[912] 
gv1561: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc921: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1561, R.dtype("float16")) _919: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_encoder_attn_out_proj_weight3, reshape889, model_decoder_layers_17_encoder_attn_out_proj_bias3, alloc921) R.vm.kill_object(reshape889) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias3) gv1562: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc922: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1562, R.dtype("float16")) cls.add(alloc917, alloc921, alloc922) R.vm.kill_object(alloc917) R.vm.kill_object(alloc921) model_decoder_layers_17_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[919] model_decoder_layers_17_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[920] gv1563: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc923: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1563, R.dtype("float16")) cls.layer_norm(alloc922, model_decoder_layers_17_final_layer_norm_weight3, model_decoder_layers_17_final_layer_norm_bias3, alloc923) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias3) model_decoder_layers_17_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[915] model_decoder_layers_17_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[916] gv1564: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc924: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1564, R.dtype("float16")) _922: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_17_fc1_weight3, alloc923, model_decoder_layers_17_fc1_bias3, alloc924) R.vm.kill_object(alloc923) R.vm.kill_object(model_decoder_layers_17_fc1_weight3) R.vm.kill_object(model_decoder_layers_17_fc1_bias3) model_decoder_layers_17_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] model_decoder_layers_17_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[918] gv1565: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc925: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1565, R.dtype("float16")) _923: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_17_fc2_weight3, alloc924, model_decoder_layers_17_fc2_bias3, alloc925) R.vm.kill_object(alloc924) R.vm.kill_object(model_decoder_layers_17_fc2_weight3) R.vm.kill_object(model_decoder_layers_17_fc2_bias3) gv1566: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc926: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1566, R.dtype("float16")) cls.add(alloc922, alloc925, alloc926) R.vm.kill_object(alloc922) R.vm.kill_object(alloc925) model_decoder_layers_18_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[928] model_decoder_layers_18_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[929] gv1567: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc927: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1567, R.dtype("float16")) cls.layer_norm(alloc926, model_decoder_layers_18_self_attn_layer_norm_weight3, model_decoder_layers_18_self_attn_layer_norm_bias3, alloc927) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias3) model_decoder_layers_18_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] model_decoder_layers_18_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[925] gv1568: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc928: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1568, R.dtype("float16")) _926: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_self_attn_q_proj_weight3, alloc927, model_decoder_layers_18_self_attn_q_proj_bias3, alloc928) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias3) 
gv1569: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape890: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc928, gv1569, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc928) model_decoder_layers_18_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] gv1570: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc929: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1570, R.dtype("float16")) _927: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_18_self_attn_k_proj_weight3, alloc927, alloc929) R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight3) gv1571: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape891: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc929, gv1571, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc929) model_decoder_layers_18_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] model_decoder_layers_18_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[923] gv1572: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc930: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1572, R.dtype("float16")) _928: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_self_attn_v_proj_weight3, alloc927, model_decoder_layers_18_self_attn_v_proj_bias3, alloc930) R.vm.kill_object(alloc927) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias3) gv1573: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape892: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc930, gv1573, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc930) gv1574: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc931: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1574, R.dtype("float16")) cls.concatenate(reshape890, reshape891, reshape892, alloc931) R.vm.kill_object(reshape890) R.vm.kill_object(reshape891) R.vm.kill_object(reshape892) gv1575: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape893: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc931, gv1575, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) 
R.vm.kill_object(alloc931) gv1576: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc932: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1576, R.dtype("float16")) _930: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape893, alloc932) R.vm.kill_object(reshape893) gv1577: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape894: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc932, gv1577, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc932) gv1578: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape895: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape894, gv1578, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape894) model_decoder_layers_18_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] model_decoder_layers_18_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[927] gv1579: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc933: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage14, R.prim_value(0), gv1579, R.dtype("float16")) _931: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_self_attn_out_proj_weight3, reshape895, model_decoder_layers_18_self_attn_out_proj_bias3, alloc933) R.vm.kill_object(reshape895) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias3) gv1580: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc934: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1580, R.dtype("float16")) cls.add(alloc926, alloc933, alloc934) R.vm.kill_object(alloc926) R.vm.kill_object(alloc933) model_decoder_layers_18_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[937] model_decoder_layers_18_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[938] gv1581: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc935: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1581, R.dtype("float16")) cls.layer_norm(alloc934, model_decoder_layers_18_encoder_attn_layer_norm_weight3, model_decoder_layers_18_encoder_attn_layer_norm_bias3, alloc935) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias3) model_decoder_layers_18_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] model_decoder_layers_18_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[934] gv1582: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc936: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1582, R.dtype("float16")) _934: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_encoder_attn_q_proj_weight3, alloc935, model_decoder_layers_18_encoder_attn_q_proj_bias3, alloc936) R.vm.kill_object(alloc935) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias3) gv1583: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape896: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc936, gv1583, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc936) gv1584: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape897: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape896, gv1584, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape896) gv1585: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc937: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1585, R.dtype("float16")) _935: R.Object 
= R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape897, alloc937) R.vm.kill_object(reshape897) gv1586: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape898: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc937, gv1586, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc937) gv1587: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape899: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape898, gv1587, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape898) model_decoder_layers_18_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] model_decoder_layers_18_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[936] gv1588: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc938: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1588, R.dtype("float16")) _936: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_encoder_attn_out_proj_weight3, reshape899, model_decoder_layers_18_encoder_attn_out_proj_bias3, alloc938) R.vm.kill_object(reshape899) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias3) gv1589: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc939: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1589, R.dtype("float16")) cls.add(alloc934, alloc938, alloc939) R.vm.kill_object(alloc934) R.vm.kill_object(alloc938) model_decoder_layers_18_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[943] model_decoder_layers_18_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[944] gv1590: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc940: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1590, R.dtype("float16")) cls.layer_norm(alloc939, model_decoder_layers_18_final_layer_norm_weight3, model_decoder_layers_18_final_layer_norm_bias3, alloc940) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias3) model_decoder_layers_18_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] model_decoder_layers_18_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[940] gv1591: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc941: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1591, R.dtype("float16")) _939: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", 
model_decoder_layers_18_fc1_weight3, alloc940, model_decoder_layers_18_fc1_bias3, alloc941) R.vm.kill_object(alloc940) R.vm.kill_object(model_decoder_layers_18_fc1_weight3) R.vm.kill_object(model_decoder_layers_18_fc1_bias3) model_decoder_layers_18_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] model_decoder_layers_18_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[942] gv1592: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc942: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1592, R.dtype("float16")) _940: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_18_fc2_weight3, alloc941, model_decoder_layers_18_fc2_bias3, alloc942) R.vm.kill_object(alloc941) R.vm.kill_object(model_decoder_layers_18_fc2_weight3) R.vm.kill_object(model_decoder_layers_18_fc2_bias3) gv1593: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc943: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1593, R.dtype("float16")) cls.add(alloc939, alloc942, alloc943) R.vm.kill_object(alloc939) R.vm.kill_object(alloc942) model_decoder_layers_19_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[952] model_decoder_layers_19_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[953] gv1594: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc944: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1594, R.dtype("float16")) cls.layer_norm(alloc943, model_decoder_layers_19_self_attn_layer_norm_weight3, model_decoder_layers_19_self_attn_layer_norm_bias3, alloc944) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias3) model_decoder_layers_19_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] model_decoder_layers_19_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[949] gv1595: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc945: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1595, R.dtype("float16")) _943: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_self_attn_q_proj_weight3, alloc944, model_decoder_layers_19_self_attn_q_proj_bias3, alloc945) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias3) gv1596: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape900: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc945, gv1596, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc945) model_decoder_layers_19_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] gv1597: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc946: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1597, R.dtype("float16")) _944: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_19_self_attn_k_proj_weight3, alloc944, alloc946) R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight3) gv1598: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape901: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc946, gv1598, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc946) model_decoder_layers_19_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] model_decoder_layers_19_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[947] gv1599: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc947: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1599, R.dtype("float16")) _945: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_self_attn_v_proj_weight3, alloc944, model_decoder_layers_19_self_attn_v_proj_bias3, alloc947) R.vm.kill_object(alloc944) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias3) gv1600: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape902: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc947, gv1600, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc947) gv1601: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc948: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1601, R.dtype("float16")) cls.concatenate(reshape900, reshape901, reshape902, alloc948) R.vm.kill_object(reshape900) R.vm.kill_object(reshape901) R.vm.kill_object(reshape902) gv1602: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape903: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc948, gv1602, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc948) gv1603: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc949: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1603, R.dtype("float16")) _947: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape903, alloc949) R.vm.kill_object(reshape903) gv1604: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape904: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc949, gv1604, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc949) gv1605: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape905: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape904, gv1605, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape904) model_decoder_layers_19_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] model_decoder_layers_19_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[951] gv1606: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc950: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1606, R.dtype("float16")) _948: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_self_attn_out_proj_weight3, reshape905, model_decoder_layers_19_self_attn_out_proj_bias3, alloc950) R.vm.kill_object(reshape905) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias3) gv1607: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc951: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1607, R.dtype("float16")) cls.add(alloc943, alloc950, alloc951) R.vm.kill_object(alloc943) R.vm.kill_object(alloc950) model_decoder_layers_19_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[961] model_decoder_layers_19_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[962] gv1608: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc952: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1608, R.dtype("float16")) cls.layer_norm(alloc951, model_decoder_layers_19_encoder_attn_layer_norm_weight3, model_decoder_layers_19_encoder_attn_layer_norm_bias3, alloc952) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias3) model_decoder_layers_19_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] model_decoder_layers_19_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[958] gv1609: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc953: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1609, R.dtype("float16")) _951: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_encoder_attn_q_proj_weight3, alloc952, model_decoder_layers_19_encoder_attn_q_proj_bias3, alloc953) R.vm.kill_object(alloc952) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight3) 
R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias3) gv1610: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape906: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc953, gv1610, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc953) gv1611: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape907: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape906, gv1611, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape906) gv1612: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc954: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1612, R.dtype("float16")) _952: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape907, alloc954) R.vm.kill_object(reshape907) gv1613: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape908: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc954, gv1613, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc954) 
gv1614: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape909: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape908, gv1614, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape908) model_decoder_layers_19_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] model_decoder_layers_19_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[960] gv1615: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc955: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1615, R.dtype("float16")) _953: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_encoder_attn_out_proj_weight3, reshape909, model_decoder_layers_19_encoder_attn_out_proj_bias3, alloc955) R.vm.kill_object(reshape909) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias3) gv1616: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc956: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1616, R.dtype("float16")) cls.add(alloc951, alloc955, alloc956) R.vm.kill_object(alloc951) R.vm.kill_object(alloc955) model_decoder_layers_19_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[967] 
model_decoder_layers_19_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[968] gv1617: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc957: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1617, R.dtype("float16")) cls.layer_norm(alloc956, model_decoder_layers_19_final_layer_norm_weight3, model_decoder_layers_19_final_layer_norm_bias3, alloc957) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias3) model_decoder_layers_19_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] model_decoder_layers_19_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[964] gv1618: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc958: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1618, R.dtype("float16")) _956: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_19_fc1_weight3, alloc957, model_decoder_layers_19_fc1_bias3, alloc958) R.vm.kill_object(alloc957) R.vm.kill_object(model_decoder_layers_19_fc1_weight3) R.vm.kill_object(model_decoder_layers_19_fc1_bias3) model_decoder_layers_19_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] model_decoder_layers_19_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[966] gv1619: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc959: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1619, R.dtype("float16")) _957: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_19_fc2_weight3, alloc958, model_decoder_layers_19_fc2_bias3, alloc959) R.vm.kill_object(alloc958) R.vm.kill_object(model_decoder_layers_19_fc2_weight3) R.vm.kill_object(model_decoder_layers_19_fc2_bias3) gv1620: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc960: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1620, R.dtype("float16")) cls.add(alloc956, alloc959, alloc960) R.vm.kill_object(alloc956) R.vm.kill_object(alloc959) model_decoder_layers_20_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[976] model_decoder_layers_20_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[977] gv1621: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc961: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1621, R.dtype("float16")) cls.layer_norm(alloc960, model_decoder_layers_20_self_attn_layer_norm_weight3, model_decoder_layers_20_self_attn_layer_norm_bias3, alloc961) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias3) model_decoder_layers_20_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] model_decoder_layers_20_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[973] gv1622: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc962: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1622, R.dtype("float16")) _960: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_self_attn_q_proj_weight3, alloc961, model_decoder_layers_20_self_attn_q_proj_bias3, alloc962) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias3) gv1623: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape910: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc962, gv1623, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc962) model_decoder_layers_20_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] gv1624: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc963: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1624, R.dtype("float16")) _961: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_20_self_attn_k_proj_weight3, alloc961, alloc963) R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight3) gv1625: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape911: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc963, gv1625, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc963) model_decoder_layers_20_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] model_decoder_layers_20_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[971] gv1626: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc964: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1626, R.dtype("float16")) _962: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_self_attn_v_proj_weight3, alloc961, model_decoder_layers_20_self_attn_v_proj_bias3, alloc964) R.vm.kill_object(alloc961) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias3) gv1627: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape912: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc964, gv1627, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc964) gv1628: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc965: 
R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1628, R.dtype("float16")) cls.concatenate(reshape910, reshape911, reshape912, alloc965) R.vm.kill_object(reshape910) R.vm.kill_object(reshape911) R.vm.kill_object(reshape912) gv1629: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape913: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc965, gv1629, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc965) gv1630: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc966: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1630, R.dtype("float16")) _964: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape913, alloc966) R.vm.kill_object(reshape913) gv1631: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape914: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc966, gv1631, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc966) gv1632: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape915: R.Tensor((batch_size, 1, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape914, gv1632, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape914) model_decoder_layers_20_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] model_decoder_layers_20_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[975] gv1633: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc967: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1633, R.dtype("float16")) _965: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_self_attn_out_proj_weight3, reshape915, model_decoder_layers_20_self_attn_out_proj_bias3, alloc967) R.vm.kill_object(reshape915) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias3) gv1634: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc968: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1634, R.dtype("float16")) cls.add(alloc960, alloc967, alloc968) R.vm.kill_object(alloc960) R.vm.kill_object(alloc967) model_decoder_layers_20_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[985] model_decoder_layers_20_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[986] gv1635: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc969: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1635, R.dtype("float16")) cls.layer_norm(alloc968, model_decoder_layers_20_encoder_attn_layer_norm_weight3, model_decoder_layers_20_encoder_attn_layer_norm_bias3, alloc969) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias3) model_decoder_layers_20_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] model_decoder_layers_20_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[982] gv1636: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc970: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1636, R.dtype("float16")) _968: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_encoder_attn_q_proj_weight3, alloc969, model_decoder_layers_20_encoder_attn_q_proj_bias3, alloc970) R.vm.kill_object(alloc969) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias3) gv1637: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape916: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc970, gv1637, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc970) gv1638: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape917: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape916, gv1638, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape916) gv1639: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc971: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1639, R.dtype("float16")) _969: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape917, alloc971) R.vm.kill_object(reshape917) gv1640: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape918: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc971, gv1640, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc971) gv1641: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape919: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape918, gv1641, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape918) model_decoder_layers_20_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] model_decoder_layers_20_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[984] 
gv1642: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc972: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1642, R.dtype("float16")) _970: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_encoder_attn_out_proj_weight3, reshape919, model_decoder_layers_20_encoder_attn_out_proj_bias3, alloc972) R.vm.kill_object(reshape919) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias3) gv1643: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc973: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1643, R.dtype("float16")) cls.add(alloc968, alloc972, alloc973) R.vm.kill_object(alloc968) R.vm.kill_object(alloc972) model_decoder_layers_20_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[991] model_decoder_layers_20_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[992] gv1644: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc974: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1644, R.dtype("float16")) cls.layer_norm(alloc973, model_decoder_layers_20_final_layer_norm_weight3, model_decoder_layers_20_final_layer_norm_bias3, alloc974) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias3) model_decoder_layers_20_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[987] model_decoder_layers_20_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[988] gv1645: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc975: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1645, R.dtype("float16")) _973: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_20_fc1_weight3, alloc974, model_decoder_layers_20_fc1_bias3, alloc975) R.vm.kill_object(alloc974) R.vm.kill_object(model_decoder_layers_20_fc1_weight3) R.vm.kill_object(model_decoder_layers_20_fc1_bias3) model_decoder_layers_20_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] model_decoder_layers_20_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[990] gv1646: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc976: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1646, R.dtype("float16")) _974: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_20_fc2_weight3, alloc975, model_decoder_layers_20_fc2_bias3, alloc976) R.vm.kill_object(alloc975) R.vm.kill_object(model_decoder_layers_20_fc2_weight3) R.vm.kill_object(model_decoder_layers_20_fc2_bias3) gv1647: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc977: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1647, R.dtype("float16")) cls.add(alloc973, alloc976, alloc977) R.vm.kill_object(alloc973) R.vm.kill_object(alloc976) model_decoder_layers_21_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1000] model_decoder_layers_21_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1001] gv1648: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc978: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1648, R.dtype("float16")) cls.layer_norm(alloc977, model_decoder_layers_21_self_attn_layer_norm_weight3, model_decoder_layers_21_self_attn_layer_norm_bias3, alloc978) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias3) model_decoder_layers_21_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] model_decoder_layers_21_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[997] gv1649: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc979: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1649, R.dtype("float16")) _977: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_self_attn_q_proj_weight3, alloc978, model_decoder_layers_21_self_attn_q_proj_bias3, alloc979) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias3) 
gv1650: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape920: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc979, gv1650, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc979) model_decoder_layers_21_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] gv1651: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc980: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1651, R.dtype("float16")) _978: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_21_self_attn_k_proj_weight3, alloc978, alloc980) R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight3) gv1652: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape921: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc980, gv1652, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc980) model_decoder_layers_21_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] model_decoder_layers_21_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[995] gv1653: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc981: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1653, R.dtype("float16")) _979: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_self_attn_v_proj_weight3, alloc978, model_decoder_layers_21_self_attn_v_proj_bias3, alloc981) R.vm.kill_object(alloc978) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias3) gv1654: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape922: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc981, gv1654, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc981) gv1655: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc982: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1655, R.dtype("float16")) cls.concatenate(reshape920, reshape921, reshape922, alloc982) R.vm.kill_object(reshape920) R.vm.kill_object(reshape921) R.vm.kill_object(reshape922) gv1656: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape923: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc982, gv1656, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) 
R.vm.kill_object(alloc982) gv1657: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc983: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1657, R.dtype("float16")) _981: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape923, alloc983) R.vm.kill_object(reshape923) gv1658: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape924: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc983, gv1658, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc983) gv1659: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape925: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape924, gv1659, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape924) model_decoder_layers_21_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] model_decoder_layers_21_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[999] gv1660: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc984: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1660, R.dtype("float16")) _982: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_self_attn_out_proj_weight3, reshape925, model_decoder_layers_21_self_attn_out_proj_bias3, alloc984) R.vm.kill_object(reshape925) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias3) gv1661: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc985: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1661, R.dtype("float16")) cls.add(alloc977, alloc984, alloc985) R.vm.kill_object(alloc977) R.vm.kill_object(alloc984) model_decoder_layers_21_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1009] model_decoder_layers_21_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1010] gv1662: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc986: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1662, R.dtype("float16")) cls.layer_norm(alloc985, model_decoder_layers_21_encoder_attn_layer_norm_weight3, model_decoder_layers_21_encoder_attn_layer_norm_bias3, alloc986) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias3) model_decoder_layers_21_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] model_decoder_layers_21_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1006] gv1663: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc987: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1663, R.dtype("float16")) _985: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_encoder_attn_q_proj_weight3, alloc986, model_decoder_layers_21_encoder_attn_q_proj_bias3, alloc987) R.vm.kill_object(alloc986) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias3) gv1664: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape926: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc987, gv1664, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc987) gv1665: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape927: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape926, gv1665, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape926) gv1666: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc988: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1666, R.dtype("float16")) _986: R.Object 
= R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape927, alloc988) R.vm.kill_object(reshape927) gv1667: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape928: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc988, gv1667, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc988) gv1668: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape929: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape928, gv1668, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape928) model_decoder_layers_21_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] model_decoder_layers_21_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1008] gv1669: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc989: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1669, R.dtype("float16")) _987: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_encoder_attn_out_proj_weight3, reshape929, model_decoder_layers_21_encoder_attn_out_proj_bias3, alloc989) R.vm.kill_object(reshape929) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias3) gv1670: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc990: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1670, R.dtype("float16")) cls.add(alloc985, alloc989, alloc990) R.vm.kill_object(alloc985) R.vm.kill_object(alloc989) model_decoder_layers_21_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1015] model_decoder_layers_21_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1016] gv1671: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc991: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1671, R.dtype("float16")) cls.layer_norm(alloc990, model_decoder_layers_21_final_layer_norm_weight3, model_decoder_layers_21_final_layer_norm_bias3, alloc991) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias3) model_decoder_layers_21_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] model_decoder_layers_21_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1012] gv1672: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc992: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1672, R.dtype("float16")) _990: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", 
model_decoder_layers_21_fc1_weight3, alloc991, model_decoder_layers_21_fc1_bias3, alloc992) R.vm.kill_object(alloc991) R.vm.kill_object(model_decoder_layers_21_fc1_weight3) R.vm.kill_object(model_decoder_layers_21_fc1_bias3) model_decoder_layers_21_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] model_decoder_layers_21_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1014] gv1673: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc993: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1673, R.dtype("float16")) _991: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_21_fc2_weight3, alloc992, model_decoder_layers_21_fc2_bias3, alloc993) R.vm.kill_object(alloc992) R.vm.kill_object(model_decoder_layers_21_fc2_weight3) R.vm.kill_object(model_decoder_layers_21_fc2_bias3) gv1674: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc994: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1674, R.dtype("float16")) cls.add(alloc990, alloc993, alloc994) R.vm.kill_object(alloc990) R.vm.kill_object(alloc993) model_decoder_layers_22_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1024] model_decoder_layers_22_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1025] gv1675: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc995: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1675, R.dtype("float16")) cls.layer_norm(alloc994, model_decoder_layers_22_self_attn_layer_norm_weight3, model_decoder_layers_22_self_attn_layer_norm_bias3, alloc995) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias3) model_decoder_layers_22_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] model_decoder_layers_22_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1021] gv1676: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc996: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1676, R.dtype("float16")) _994: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_self_attn_q_proj_weight3, alloc995, model_decoder_layers_22_self_attn_q_proj_bias3, alloc996) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias3) gv1677: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape930: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc996, gv1677, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc996) model_decoder_layers_22_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] gv1678: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc997: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1678, R.dtype("float16")) _995: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_22_self_attn_k_proj_weight3, alloc995, alloc997) R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight3) gv1679: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape931: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc997, gv1679, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc997) model_decoder_layers_22_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] model_decoder_layers_22_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1019] gv1680: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc998: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1680, R.dtype("float16")) _996: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_self_attn_v_proj_weight3, alloc995, model_decoder_layers_22_self_attn_v_proj_bias3, alloc998) R.vm.kill_object(alloc995) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias3) gv1681: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape932: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc998, gv1681, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc998) gv1682: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc999: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1682, R.dtype("float16")) cls.concatenate(reshape930, reshape931, reshape932, alloc999) R.vm.kill_object(reshape930) R.vm.kill_object(reshape931) R.vm.kill_object(reshape932) gv1683: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape933: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc999, gv1683, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc999) gv1684: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1000: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1684, R.dtype("float16")) _998: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape933, alloc1000) R.vm.kill_object(reshape933) gv1685: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape934: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1000, gv1685, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1000) gv1686: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape935: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape934, gv1686, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape934) model_decoder_layers_22_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] model_decoder_layers_22_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1023] gv1687: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1001: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1687, R.dtype("float16")) _999: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_self_attn_out_proj_weight3, reshape935, model_decoder_layers_22_self_attn_out_proj_bias3, alloc1001) R.vm.kill_object(reshape935) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias3) gv1688: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1002: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1688, R.dtype("float16")) cls.add(alloc994, alloc1001, alloc1002) R.vm.kill_object(alloc994) R.vm.kill_object(alloc1001) model_decoder_layers_22_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1033] model_decoder_layers_22_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1034] gv1689: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1003: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1689, R.dtype("float16")) cls.layer_norm(alloc1002, model_decoder_layers_22_encoder_attn_layer_norm_weight3, model_decoder_layers_22_encoder_attn_layer_norm_bias3, alloc1003) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias3) model_decoder_layers_22_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] model_decoder_layers_22_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1030] gv1690: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1004: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1690, R.dtype("float16")) _1002: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_encoder_attn_q_proj_weight3, alloc1003, model_decoder_layers_22_encoder_attn_q_proj_bias3, alloc1004) R.vm.kill_object(alloc1003) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight3) 
R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias3) gv1691: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape936: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1004, gv1691, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1004) gv1692: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape937: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape936, gv1692, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape936) gv1693: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1005: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1693, R.dtype("float16")) _1003: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape937, alloc1005) R.vm.kill_object(reshape937) gv1694: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape938: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1005, gv1694, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc1005) gv1695: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape939: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape938, gv1695, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape938) model_decoder_layers_22_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] model_decoder_layers_22_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1032] gv1696: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1006: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1696, R.dtype("float16")) _1004: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_encoder_attn_out_proj_weight3, reshape939, model_decoder_layers_22_encoder_attn_out_proj_bias3, alloc1006) R.vm.kill_object(reshape939) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias3) gv1697: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1007: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1697, R.dtype("float16")) cls.add(alloc1002, alloc1006, alloc1007) R.vm.kill_object(alloc1002) R.vm.kill_object(alloc1006) model_decoder_layers_22_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1039] 
model_decoder_layers_22_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1040] gv1698: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1008: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1698, R.dtype("float16")) cls.layer_norm(alloc1007, model_decoder_layers_22_final_layer_norm_weight3, model_decoder_layers_22_final_layer_norm_bias3, alloc1008) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias3) model_decoder_layers_22_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] model_decoder_layers_22_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1036] gv1699: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1009: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1699, R.dtype("float16")) _1007: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_22_fc1_weight3, alloc1008, model_decoder_layers_22_fc1_bias3, alloc1009) R.vm.kill_object(alloc1008) R.vm.kill_object(model_decoder_layers_22_fc1_weight3) R.vm.kill_object(model_decoder_layers_22_fc1_bias3) model_decoder_layers_22_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] model_decoder_layers_22_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1038] gv1700: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1010: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1700, R.dtype("float16")) _1008: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_22_fc2_weight3, alloc1009, model_decoder_layers_22_fc2_bias3, alloc1010) R.vm.kill_object(alloc1009) R.vm.kill_object(model_decoder_layers_22_fc2_weight3) R.vm.kill_object(model_decoder_layers_22_fc2_bias3) gv1701: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1011: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1701, R.dtype("float16")) cls.add(alloc1007, alloc1010, alloc1011) R.vm.kill_object(alloc1007) R.vm.kill_object(alloc1010) model_decoder_layers_23_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1048] model_decoder_layers_23_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1049] gv1702: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1012: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1702, R.dtype("float16")) cls.layer_norm(alloc1011, model_decoder_layers_23_self_attn_layer_norm_weight3, model_decoder_layers_23_self_attn_layer_norm_bias3, alloc1012) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias3) model_decoder_layers_23_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] model_decoder_layers_23_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1045] gv1703: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1013: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1703, R.dtype("float16")) _1011: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_self_attn_q_proj_weight3, alloc1012, model_decoder_layers_23_self_attn_q_proj_bias3, alloc1013) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias3) gv1704: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape940: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1013, gv1704, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1013) model_decoder_layers_23_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] gv1705: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1014: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1705, R.dtype("float16")) _1012: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_23_self_attn_k_proj_weight3, alloc1012, alloc1014) R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight3) gv1706: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape941: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1014, gv1706, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1014) model_decoder_layers_23_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] model_decoder_layers_23_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1043] gv1707: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1015: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1707, R.dtype("float16")) _1013: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_self_attn_v_proj_weight3, alloc1012, model_decoder_layers_23_self_attn_v_proj_bias3, alloc1015) R.vm.kill_object(alloc1012) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias3) gv1708: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape942: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1015, gv1708, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1015) gv1709: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) alloc1016: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1709, R.dtype("float16")) cls.concatenate(reshape940, reshape941, reshape942, alloc1016) R.vm.kill_object(reshape940) R.vm.kill_object(reshape941) R.vm.kill_object(reshape942) gv1710: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape943: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1016, gv1710, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1016) gv1711: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1017: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1711, R.dtype("float16")) _1015: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape943, alloc1017) R.vm.kill_object(reshape943) gv1712: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape944: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1017, gv1712, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1017) gv1713: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
reshape945: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape944, gv1713, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape944) model_decoder_layers_23_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] model_decoder_layers_23_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1047] gv1714: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1018: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1714, R.dtype("float16")) _1016: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_self_attn_out_proj_weight3, reshape945, model_decoder_layers_23_self_attn_out_proj_bias3, alloc1018) R.vm.kill_object(reshape945) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias3) gv1715: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1019: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1715, R.dtype("float16")) cls.add(alloc1011, alloc1018, alloc1019) R.vm.kill_object(alloc1011) R.vm.kill_object(alloc1018) model_decoder_layers_23_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1057] model_decoder_layers_23_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1058] gv1716: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1020: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1716, R.dtype("float16")) cls.layer_norm(alloc1019, model_decoder_layers_23_encoder_attn_layer_norm_weight3, model_decoder_layers_23_encoder_attn_layer_norm_bias3, alloc1020) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias3) model_decoder_layers_23_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] model_decoder_layers_23_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1054] gv1717: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1021: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1717, R.dtype("float16")) _1019: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_encoder_attn_q_proj_weight3, alloc1020, model_decoder_layers_23_encoder_attn_q_proj_bias3, alloc1021) R.vm.kill_object(alloc1020) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias3) gv1718: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape946: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1021, gv1718, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1021) gv1719: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape947: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape946, gv1719, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape946) gv1720: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1022: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1720, R.dtype("float16")) _1020: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape947, alloc1022) R.vm.kill_object(reshape947) gv1721: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape948: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1022, gv1721, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1022) gv1722: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape949: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape948, gv1722, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape948) model_decoder_layers_23_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] 
model_decoder_layers_23_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1056] gv1723: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1023: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1723, R.dtype("float16")) _1021: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_encoder_attn_out_proj_weight3, reshape949, model_decoder_layers_23_encoder_attn_out_proj_bias3, alloc1023) R.vm.kill_object(reshape949) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias3) gv1724: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1024: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1724, R.dtype("float16")) cls.add(alloc1019, alloc1023, alloc1024) R.vm.kill_object(alloc1019) R.vm.kill_object(alloc1023) model_decoder_layers_23_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1063] model_decoder_layers_23_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1064] gv1725: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1025: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1725, R.dtype("float16")) cls.layer_norm(alloc1024, model_decoder_layers_23_final_layer_norm_weight3, model_decoder_layers_23_final_layer_norm_bias3, alloc1025) 
R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias3) model_decoder_layers_23_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] model_decoder_layers_23_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1060] gv1726: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1026: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1726, R.dtype("float16")) _1024: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_23_fc1_weight3, alloc1025, model_decoder_layers_23_fc1_bias3, alloc1026) R.vm.kill_object(alloc1025) R.vm.kill_object(model_decoder_layers_23_fc1_weight3) R.vm.kill_object(model_decoder_layers_23_fc1_bias3) model_decoder_layers_23_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] model_decoder_layers_23_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1062] gv1727: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1027: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1727, R.dtype("float16")) _1025: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_23_fc2_weight3, alloc1026, model_decoder_layers_23_fc2_bias3, alloc1027) R.vm.kill_object(alloc1026) R.vm.kill_object(model_decoder_layers_23_fc2_weight3) R.vm.kill_object(model_decoder_layers_23_fc2_bias3) gv1728: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1028: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1728, R.dtype("float16")) cls.add(alloc1024, alloc1027, alloc1028) R.vm.kill_object(alloc1024) R.vm.kill_object(alloc1027) model_decoder_layers_24_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1072] model_decoder_layers_24_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1073] gv1729: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1029: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1729, R.dtype("float16")) cls.layer_norm(alloc1028, model_decoder_layers_24_self_attn_layer_norm_weight3, model_decoder_layers_24_self_attn_layer_norm_bias3, alloc1029) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias3) model_decoder_layers_24_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] model_decoder_layers_24_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1069] gv1730: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1030: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1730, R.dtype("float16")) _1028: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_self_attn_q_proj_weight3, alloc1029, model_decoder_layers_24_self_attn_q_proj_bias3, alloc1030) 
R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias3) gv1731: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape950: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1030, gv1731, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1030) model_decoder_layers_24_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] gv1732: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1031: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1732, R.dtype("float16")) _1029: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_24_self_attn_k_proj_weight3, alloc1029, alloc1031) R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight3) gv1733: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape951: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1031, gv1733, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1031) model_decoder_layers_24_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] model_decoder_layers_24_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1067] gv1734: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1032: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1734, R.dtype("float16")) _1030: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_self_attn_v_proj_weight3, alloc1029, model_decoder_layers_24_self_attn_v_proj_bias3, alloc1032) R.vm.kill_object(alloc1029) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias3) gv1735: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape952: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1032, gv1735, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1032) gv1736: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1033: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1736, R.dtype("float16")) cls.concatenate(reshape950, reshape951, reshape952, alloc1033) R.vm.kill_object(reshape950) R.vm.kill_object(reshape951) R.vm.kill_object(reshape952) gv1737: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape953: R.Tensor((batch_size, 60, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1033, gv1737, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1033) gv1738: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1034: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1738, R.dtype("float16")) _1032: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape953, alloc1034) R.vm.kill_object(reshape953) gv1739: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape954: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1034, gv1739, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1034) gv1740: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape955: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape954, gv1740, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape954) model_decoder_layers_24_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] model_decoder_layers_24_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1071] gv1741: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1035: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1741, R.dtype("float16")) _1033: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_self_attn_out_proj_weight3, reshape955, model_decoder_layers_24_self_attn_out_proj_bias3, alloc1035) R.vm.kill_object(reshape955) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias3) gv1742: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1036: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1742, R.dtype("float16")) cls.add(alloc1028, alloc1035, alloc1036) R.vm.kill_object(alloc1028) R.vm.kill_object(alloc1035) model_decoder_layers_24_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1081] model_decoder_layers_24_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1082] gv1743: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1037: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1743, R.dtype("float16")) cls.layer_norm(alloc1036, model_decoder_layers_24_encoder_attn_layer_norm_weight3, model_decoder_layers_24_encoder_attn_layer_norm_bias3, alloc1037) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias3) model_decoder_layers_24_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[1077] model_decoder_layers_24_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1078] gv1744: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1038: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1744, R.dtype("float16")) _1036: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_encoder_attn_q_proj_weight3, alloc1037, model_decoder_layers_24_encoder_attn_q_proj_bias3, alloc1038) R.vm.kill_object(alloc1037) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias3) gv1745: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape956: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1038, gv1745, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1038) gv1746: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape957: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape956, gv1746, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape956) gv1747: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) alloc1039: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1747, R.dtype("float16")) _1037: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape957, alloc1039) R.vm.kill_object(reshape957) gv1748: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape958: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1039, gv1748, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1039) gv1749: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape959: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape958, gv1749, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape958) model_decoder_layers_24_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] model_decoder_layers_24_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1080] gv1750: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1040: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1750, R.dtype("float16")) _1038: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_encoder_attn_out_proj_weight3, 
reshape959, model_decoder_layers_24_encoder_attn_out_proj_bias3, alloc1040) R.vm.kill_object(reshape959) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias3) gv1751: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1041: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1751, R.dtype("float16")) cls.add(alloc1036, alloc1040, alloc1041) R.vm.kill_object(alloc1036) R.vm.kill_object(alloc1040) model_decoder_layers_24_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1087] model_decoder_layers_24_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1088] gv1752: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1042: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1752, R.dtype("float16")) cls.layer_norm(alloc1041, model_decoder_layers_24_final_layer_norm_weight3, model_decoder_layers_24_final_layer_norm_bias3, alloc1042) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias3) model_decoder_layers_24_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] model_decoder_layers_24_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1084] gv1753: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1043: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage13, R.prim_value(0), gv1753, R.dtype("float16")) _1041: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_24_fc1_weight3, alloc1042, model_decoder_layers_24_fc1_bias3, alloc1043) R.vm.kill_object(alloc1042) R.vm.kill_object(model_decoder_layers_24_fc1_weight3) R.vm.kill_object(model_decoder_layers_24_fc1_bias3) model_decoder_layers_24_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] model_decoder_layers_24_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1086] gv1754: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1044: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1754, R.dtype("float16")) _1042: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_24_fc2_weight3, alloc1043, model_decoder_layers_24_fc2_bias3, alloc1044) R.vm.kill_object(alloc1043) R.vm.kill_object(model_decoder_layers_24_fc2_weight3) R.vm.kill_object(model_decoder_layers_24_fc2_bias3) gv1755: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1045: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1755, R.dtype("float16")) cls.add(alloc1041, alloc1044, alloc1045) R.vm.kill_object(alloc1041) R.vm.kill_object(alloc1044) model_decoder_layers_25_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1096] model_decoder_layers_25_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1097] gv1756: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1046: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1756, R.dtype("float16")) cls.layer_norm(alloc1045, model_decoder_layers_25_self_attn_layer_norm_weight3, model_decoder_layers_25_self_attn_layer_norm_bias3, alloc1046) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias3) model_decoder_layers_25_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] model_decoder_layers_25_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1093] gv1757: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1047: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1757, R.dtype("float16")) _1045: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_self_attn_q_proj_weight3, alloc1046, model_decoder_layers_25_self_attn_q_proj_bias3, alloc1047) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias3) gv1758: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape960: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1047, gv1758, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1047) 
model_decoder_layers_25_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] gv1759: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1048: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1759, R.dtype("float16")) _1046: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_25_self_attn_k_proj_weight3, alloc1046, alloc1048) R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight3) gv1760: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape961: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1048, gv1760, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1048) model_decoder_layers_25_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] model_decoder_layers_25_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1091] gv1761: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1049: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1761, R.dtype("float16")) _1047: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_self_attn_v_proj_weight3, alloc1046, model_decoder_layers_25_self_attn_v_proj_bias3, alloc1049) R.vm.kill_object(alloc1046) 
R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias3) gv1762: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape962: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1049, gv1762, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1049) gv1763: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1050: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1763, R.dtype("float16")) cls.concatenate(reshape960, reshape961, reshape962, alloc1050) R.vm.kill_object(reshape960) R.vm.kill_object(reshape961) R.vm.kill_object(reshape962) gv1764: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape963: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1050, gv1764, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1050) gv1765: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1051: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1765, R.dtype("float16")) _1049: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape963, alloc1051) R.vm.kill_object(reshape963) gv1766: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape964: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1051, gv1766, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1051) gv1767: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape965: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape964, gv1767, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape964) model_decoder_layers_25_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] model_decoder_layers_25_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1095] gv1768: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1052: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1768, R.dtype("float16")) _1050: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_self_attn_out_proj_weight3, reshape965, model_decoder_layers_25_self_attn_out_proj_bias3, alloc1052) R.vm.kill_object(reshape965) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias3) gv1769: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1053: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1769, R.dtype("float16")) cls.add(alloc1045, alloc1052, alloc1053) R.vm.kill_object(alloc1045) R.vm.kill_object(alloc1052) model_decoder_layers_25_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1105] model_decoder_layers_25_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1106] gv1770: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1054: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1770, R.dtype("float16")) cls.layer_norm(alloc1053, model_decoder_layers_25_encoder_attn_layer_norm_weight3, model_decoder_layers_25_encoder_attn_layer_norm_bias3, alloc1054) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias3) model_decoder_layers_25_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] model_decoder_layers_25_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1102] gv1771: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1055: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1771, R.dtype("float16")) _1053: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_encoder_attn_q_proj_weight3, alloc1054, model_decoder_layers_25_encoder_attn_q_proj_bias3, alloc1055) R.vm.kill_object(alloc1054) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias3) gv1772: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape966: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1055, gv1772, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1055) gv1773: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape967: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape966, gv1773, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape966) gv1774: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1056: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1774, R.dtype("float16")) _1054: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape967, alloc1056) R.vm.kill_object(reshape967) gv1775: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape968: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1056, gv1775, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1056) gv1776: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape969: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape968, gv1776, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape968) model_decoder_layers_25_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] model_decoder_layers_25_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1104] gv1777: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1057: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1777, R.dtype("float16")) _1055: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_encoder_attn_out_proj_weight3, reshape969, model_decoder_layers_25_encoder_attn_out_proj_bias3, alloc1057) R.vm.kill_object(reshape969) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias3) gv1778: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1058: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1778, R.dtype("float16")) cls.add(alloc1053, alloc1057, alloc1058) R.vm.kill_object(alloc1053) R.vm.kill_object(alloc1057) model_decoder_layers_25_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1111] model_decoder_layers_25_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1112] gv1779: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1059: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1779, R.dtype("float16")) cls.layer_norm(alloc1058, model_decoder_layers_25_final_layer_norm_weight3, model_decoder_layers_25_final_layer_norm_bias3, alloc1059) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias3) model_decoder_layers_25_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] model_decoder_layers_25_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1108] gv1780: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1060: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1780, R.dtype("float16")) _1058: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_25_fc1_weight3, alloc1059, model_decoder_layers_25_fc1_bias3, alloc1060) R.vm.kill_object(alloc1059) R.vm.kill_object(model_decoder_layers_25_fc1_weight3) R.vm.kill_object(model_decoder_layers_25_fc1_bias3) model_decoder_layers_25_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] 
model_decoder_layers_25_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1110] gv1781: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1061: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1781, R.dtype("float16")) _1059: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_25_fc2_weight3, alloc1060, model_decoder_layers_25_fc2_bias3, alloc1061) R.vm.kill_object(alloc1060) R.vm.kill_object(model_decoder_layers_25_fc2_weight3) R.vm.kill_object(model_decoder_layers_25_fc2_bias3) gv1782: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1062: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1782, R.dtype("float16")) cls.add(alloc1058, alloc1061, alloc1062) R.vm.kill_object(alloc1058) R.vm.kill_object(alloc1061) model_decoder_layers_26_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1120] model_decoder_layers_26_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1121] gv1783: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1063: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1783, R.dtype("float16")) cls.layer_norm(alloc1062, model_decoder_layers_26_self_attn_layer_norm_weight3, model_decoder_layers_26_self_attn_layer_norm_bias3, alloc1063) R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight3) 
R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias3) model_decoder_layers_26_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] model_decoder_layers_26_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1117] gv1784: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1064: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1784, R.dtype("float16")) _1062: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_self_attn_q_proj_weight3, alloc1063, model_decoder_layers_26_self_attn_q_proj_bias3, alloc1064) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias3) gv1785: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape970: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1064, gv1785, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1064) model_decoder_layers_26_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] gv1786: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1065: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1786, R.dtype("float16")) _1063: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_26_self_attn_k_proj_weight3, alloc1063, alloc1065) R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight3) gv1787: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape971: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1065, gv1787, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1065) model_decoder_layers_26_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] model_decoder_layers_26_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1115] gv1788: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1066: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1788, R.dtype("float16")) _1064: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_self_attn_v_proj_weight3, alloc1063, model_decoder_layers_26_self_attn_v_proj_bias3, alloc1066) R.vm.kill_object(alloc1063) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias3) gv1789: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape972: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1066, 
gv1789, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1066) gv1790: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1067: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1790, R.dtype("float16")) cls.concatenate(reshape970, reshape971, reshape972, alloc1067) R.vm.kill_object(reshape970) R.vm.kill_object(reshape971) R.vm.kill_object(reshape972) gv1791: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape973: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1067, gv1791, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1067) gv1792: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1068: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1792, R.dtype("float16")) _1066: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape973, alloc1068) R.vm.kill_object(reshape973) gv1793: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape974: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1068, gv1793, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1068) gv1794: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape975: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape974, gv1794, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape974) model_decoder_layers_26_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] model_decoder_layers_26_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1119] gv1795: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1069: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1795, R.dtype("float16")) _1067: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_self_attn_out_proj_weight3, reshape975, model_decoder_layers_26_self_attn_out_proj_bias3, alloc1069) R.vm.kill_object(reshape975) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias3) gv1796: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1070: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1796, R.dtype("float16")) cls.add(alloc1062, alloc1069, alloc1070) R.vm.kill_object(alloc1062) R.vm.kill_object(alloc1069) 
model_decoder_layers_26_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1129] model_decoder_layers_26_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1130] gv1797: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1071: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1797, R.dtype("float16")) cls.layer_norm(alloc1070, model_decoder_layers_26_encoder_attn_layer_norm_weight3, model_decoder_layers_26_encoder_attn_layer_norm_bias3, alloc1071) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias3) model_decoder_layers_26_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] model_decoder_layers_26_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1126] gv1798: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1072: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1798, R.dtype("float16")) _1070: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_encoder_attn_q_proj_weight3, alloc1071, model_decoder_layers_26_encoder_attn_q_proj_bias3, alloc1072) R.vm.kill_object(alloc1071) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias3) gv1799: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape976: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1072, gv1799, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1072) gv1800: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape977: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape976, gv1800, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape976) gv1801: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1073: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1801, R.dtype("float16")) _1071: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape977, alloc1073) R.vm.kill_object(reshape977) gv1802: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape978: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1073, gv1802, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1073) gv1803: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) reshape979: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape978, gv1803, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape978) model_decoder_layers_26_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] model_decoder_layers_26_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1128] gv1804: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1074: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1804, R.dtype("float16")) _1072: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_encoder_attn_out_proj_weight3, reshape979, model_decoder_layers_26_encoder_attn_out_proj_bias3, alloc1074) R.vm.kill_object(reshape979) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias3) gv1805: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1075: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1805, R.dtype("float16")) cls.add(alloc1070, alloc1074, alloc1075) R.vm.kill_object(alloc1070) R.vm.kill_object(alloc1074) model_decoder_layers_26_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1135] model_decoder_layers_26_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1136] gv1806: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1076: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1806, R.dtype("float16")) cls.layer_norm(alloc1075, model_decoder_layers_26_final_layer_norm_weight3, model_decoder_layers_26_final_layer_norm_bias3, alloc1076) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias3) model_decoder_layers_26_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] model_decoder_layers_26_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1132] gv1807: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1077: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1807, R.dtype("float16")) _1075: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_26_fc1_weight3, alloc1076, model_decoder_layers_26_fc1_bias3, alloc1077) R.vm.kill_object(alloc1076) R.vm.kill_object(model_decoder_layers_26_fc1_weight3) R.vm.kill_object(model_decoder_layers_26_fc1_bias3) model_decoder_layers_26_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] model_decoder_layers_26_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1134] gv1808: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1078: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1808, R.dtype("float16")) _1076: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_26_fc2_weight3, alloc1077, model_decoder_layers_26_fc2_bias3, alloc1078) R.vm.kill_object(alloc1077) R.vm.kill_object(model_decoder_layers_26_fc2_weight3) R.vm.kill_object(model_decoder_layers_26_fc2_bias3) gv1809: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1079: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1809, R.dtype("float16")) cls.add(alloc1075, alloc1078, alloc1079) R.vm.kill_object(alloc1075) R.vm.kill_object(alloc1078) model_decoder_layers_27_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1144] model_decoder_layers_27_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1145] gv1810: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1080: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1810, R.dtype("float16")) cls.layer_norm(alloc1079, model_decoder_layers_27_self_attn_layer_norm_weight3, model_decoder_layers_27_self_attn_layer_norm_bias3, alloc1080) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias3) model_decoder_layers_27_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] model_decoder_layers_27_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1141] gv1811: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1081: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1811, R.dtype("float16")) _1079: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_self_attn_q_proj_weight3, alloc1080, model_decoder_layers_27_self_attn_q_proj_bias3, alloc1081) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias3) gv1812: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape980: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1081, gv1812, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1081) model_decoder_layers_27_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] gv1813: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1082: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1813, R.dtype("float16")) _1080: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_27_self_attn_k_proj_weight3, alloc1080, alloc1082) R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight3) gv1814: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape981: R.Tensor((batch_size, 1, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1082, gv1814, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1082) model_decoder_layers_27_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] model_decoder_layers_27_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1139] gv1815: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1083: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1815, R.dtype("float16")) _1081: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_self_attn_v_proj_weight3, alloc1080, model_decoder_layers_27_self_attn_v_proj_bias3, alloc1083) R.vm.kill_object(alloc1080) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias3) gv1816: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape982: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1083, gv1816, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1083) gv1817: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1084: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1817, R.dtype("float16")) cls.concatenate(reshape980, 
reshape981, reshape982, alloc1084) R.vm.kill_object(reshape980) R.vm.kill_object(reshape981) R.vm.kill_object(reshape982) gv1818: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape983: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1084, gv1818, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1084) gv1819: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1085: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1819, R.dtype("float16")) _1083: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape983, alloc1085) R.vm.kill_object(reshape983) gv1820: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape984: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1085, gv1820, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1085) gv1821: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape985: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape984, gv1821, sinfo_args=(R.Tensor((batch_size, 1, 1280), 
dtype="float16"),)) R.vm.kill_object(reshape984) model_decoder_layers_27_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] model_decoder_layers_27_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1143] gv1822: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1086: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1822, R.dtype("float16")) _1084: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_self_attn_out_proj_weight3, reshape985, model_decoder_layers_27_self_attn_out_proj_bias3, alloc1086) R.vm.kill_object(reshape985) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias3) gv1823: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1087: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1823, R.dtype("float16")) cls.add(alloc1079, alloc1086, alloc1087) R.vm.kill_object(alloc1079) R.vm.kill_object(alloc1086) model_decoder_layers_27_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1153] model_decoder_layers_27_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1154] gv1824: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1088: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), 
gv1824, R.dtype("float16")) cls.layer_norm(alloc1087, model_decoder_layers_27_encoder_attn_layer_norm_weight3, model_decoder_layers_27_encoder_attn_layer_norm_bias3, alloc1088) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias3) model_decoder_layers_27_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] model_decoder_layers_27_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1150] gv1825: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1089: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1825, R.dtype("float16")) _1087: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_encoder_attn_q_proj_weight3, alloc1088, model_decoder_layers_27_encoder_attn_q_proj_bias3, alloc1089) R.vm.kill_object(alloc1088) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias3) gv1826: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape986: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1089, gv1826, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1089) gv1827: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape987: 
R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape986, gv1827, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape986) gv1828: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1090: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1828, R.dtype("float16")) _1088: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape987, alloc1090) R.vm.kill_object(reshape987) gv1829: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape988: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1090, gv1829, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1090) gv1830: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape989: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape988, gv1830, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape988) model_decoder_layers_27_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] model_decoder_layers_27_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1152] gv1831: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1091: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1831, R.dtype("float16")) _1089: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_encoder_attn_out_proj_weight3, reshape989, model_decoder_layers_27_encoder_attn_out_proj_bias3, alloc1091) R.vm.kill_object(reshape989) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias3) gv1832: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1092: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1832, R.dtype("float16")) cls.add(alloc1087, alloc1091, alloc1092) R.vm.kill_object(alloc1087) R.vm.kill_object(alloc1091) model_decoder_layers_27_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1159] model_decoder_layers_27_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1160] gv1833: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1093: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1833, R.dtype("float16")) cls.layer_norm(alloc1092, model_decoder_layers_27_final_layer_norm_weight3, model_decoder_layers_27_final_layer_norm_bias3, alloc1093) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias3) model_decoder_layers_27_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = 
packed_params[1155] model_decoder_layers_27_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1156] gv1834: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1094: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1834, R.dtype("float16")) _1092: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_27_fc1_weight3, alloc1093, model_decoder_layers_27_fc1_bias3, alloc1094) R.vm.kill_object(alloc1093) R.vm.kill_object(model_decoder_layers_27_fc1_weight3) R.vm.kill_object(model_decoder_layers_27_fc1_bias3) model_decoder_layers_27_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] model_decoder_layers_27_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1158] gv1835: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1095: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1835, R.dtype("float16")) _1093: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_27_fc2_weight3, alloc1094, model_decoder_layers_27_fc2_bias3, alloc1095) R.vm.kill_object(alloc1094) R.vm.kill_object(model_decoder_layers_27_fc2_weight3) R.vm.kill_object(model_decoder_layers_27_fc2_bias3) gv1836: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1096: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1836, 
R.dtype("float16")) cls.add(alloc1092, alloc1095, alloc1096) R.vm.kill_object(alloc1092) R.vm.kill_object(alloc1095) model_decoder_layers_28_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1168] model_decoder_layers_28_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1169] gv1837: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1097: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1837, R.dtype("float16")) cls.layer_norm(alloc1096, model_decoder_layers_28_self_attn_layer_norm_weight3, model_decoder_layers_28_self_attn_layer_norm_bias3, alloc1097) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias3) model_decoder_layers_28_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] model_decoder_layers_28_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1165] gv1838: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1098: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1838, R.dtype("float16")) _1096: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_self_attn_q_proj_weight3, alloc1097, model_decoder_layers_28_self_attn_q_proj_bias3, alloc1098) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias3) gv1839: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape990: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1098, gv1839, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1098) model_decoder_layers_28_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] gv1840: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1099: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1840, R.dtype("float16")) _1097: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_28_self_attn_k_proj_weight3, alloc1097, alloc1099) R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight3) gv1841: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape991: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1099, gv1841, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1099) model_decoder_layers_28_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] model_decoder_layers_28_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1163] gv1842: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1100: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1842, R.dtype("float16")) _1098: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_self_attn_v_proj_weight3, alloc1097, model_decoder_layers_28_self_attn_v_proj_bias3, alloc1100) R.vm.kill_object(alloc1097) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias3) gv1843: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape992: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1100, gv1843, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1100) gv1844: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1101: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1844, R.dtype("float16")) cls.concatenate(reshape990, reshape991, reshape992, alloc1101) R.vm.kill_object(reshape990) R.vm.kill_object(reshape991) R.vm.kill_object(reshape992) gv1845: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape993: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1101, gv1845, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1101) gv1846: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1102: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1846, R.dtype("float16")) _1100: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape993, alloc1102) R.vm.kill_object(reshape993) gv1847: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape994: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1102, gv1847, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1102) gv1848: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape995: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape994, gv1848, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape994) model_decoder_layers_28_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] model_decoder_layers_28_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1167] gv1849: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1103: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1849, R.dtype("float16")) _1101: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_self_attn_out_proj_weight3, reshape995, model_decoder_layers_28_self_attn_out_proj_bias3, alloc1103) R.vm.kill_object(reshape995) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias3) gv1850: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1104: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1850, R.dtype("float16")) cls.add(alloc1096, alloc1103, alloc1104) R.vm.kill_object(alloc1096) R.vm.kill_object(alloc1103) model_decoder_layers_28_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1177] model_decoder_layers_28_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1178] gv1851: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1105: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1851, R.dtype("float16")) cls.layer_norm(alloc1104, model_decoder_layers_28_encoder_attn_layer_norm_weight3, model_decoder_layers_28_encoder_attn_layer_norm_bias3, alloc1105) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias3) model_decoder_layers_28_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] model_decoder_layers_28_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1174] gv1852: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1106: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1852, R.dtype("float16")) _1104: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_encoder_attn_q_proj_weight3, alloc1105, model_decoder_layers_28_encoder_attn_q_proj_bias3, alloc1106) R.vm.kill_object(alloc1105) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias3) gv1853: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape996: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1106, gv1853, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1106) gv1854: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape997: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape996, gv1854, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape996) gv1855: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1107: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1855, R.dtype("float16")) _1105: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", 
paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape997, alloc1107) R.vm.kill_object(reshape997) gv1856: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape998: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1107, gv1856, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1107) gv1857: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape999: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape998, gv1857, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape998) model_decoder_layers_28_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] model_decoder_layers_28_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1176] gv1858: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1108: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1858, R.dtype("float16")) _1106: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_encoder_attn_out_proj_weight3, reshape999, model_decoder_layers_28_encoder_attn_out_proj_bias3, alloc1108) R.vm.kill_object(reshape999) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight3) 
R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias3) gv1859: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1109: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1859, R.dtype("float16")) cls.add(alloc1104, alloc1108, alloc1109) R.vm.kill_object(alloc1104) R.vm.kill_object(alloc1108) model_decoder_layers_28_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1183] model_decoder_layers_28_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1184] gv1860: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1110: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1860, R.dtype("float16")) cls.layer_norm(alloc1109, model_decoder_layers_28_final_layer_norm_weight3, model_decoder_layers_28_final_layer_norm_bias3, alloc1110) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias3) model_decoder_layers_28_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] model_decoder_layers_28_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1180] gv1861: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1111: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1861, R.dtype("float16")) _1109: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", 
model_decoder_layers_28_fc1_weight3, alloc1110, model_decoder_layers_28_fc1_bias3, alloc1111) R.vm.kill_object(alloc1110) R.vm.kill_object(model_decoder_layers_28_fc1_weight3) R.vm.kill_object(model_decoder_layers_28_fc1_bias3) model_decoder_layers_28_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] model_decoder_layers_28_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1182] gv1862: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1112: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1862, R.dtype("float16")) _1110: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_28_fc2_weight3, alloc1111, model_decoder_layers_28_fc2_bias3, alloc1112) R.vm.kill_object(alloc1111) R.vm.kill_object(model_decoder_layers_28_fc2_weight3) R.vm.kill_object(model_decoder_layers_28_fc2_bias3) gv1863: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1113: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1863, R.dtype("float16")) cls.add(alloc1109, alloc1112, alloc1113) R.vm.kill_object(alloc1109) R.vm.kill_object(alloc1112) model_decoder_layers_29_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1192] model_decoder_layers_29_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1193] gv1864: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1114: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1864, R.dtype("float16")) cls.layer_norm(alloc1113, model_decoder_layers_29_self_attn_layer_norm_weight3, model_decoder_layers_29_self_attn_layer_norm_bias3, alloc1114) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias3) model_decoder_layers_29_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] model_decoder_layers_29_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1189] gv1865: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1115: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1865, R.dtype("float16")) _1113: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_self_attn_q_proj_weight3, alloc1114, model_decoder_layers_29_self_attn_q_proj_bias3, alloc1115) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias3) gv1866: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1000: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1115, gv1866, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1115) model_decoder_layers_29_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] gv1867: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1116: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1867, R.dtype("float16")) _1114: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_29_self_attn_k_proj_weight3, alloc1114, alloc1116) R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight3) gv1868: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1001: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1116, gv1868, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1116) model_decoder_layers_29_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] model_decoder_layers_29_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1187] gv1869: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1117: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1869, R.dtype("float16")) _1115: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_self_attn_v_proj_weight3, alloc1114, model_decoder_layers_29_self_attn_v_proj_bias3, alloc1117) R.vm.kill_object(alloc1114) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias3) gv1870: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1002: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1117, gv1870, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1117) gv1871: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1118: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1871, R.dtype("float16")) cls.concatenate(reshape1000, reshape1001, reshape1002, alloc1118) R.vm.kill_object(reshape1000) R.vm.kill_object(reshape1001) R.vm.kill_object(reshape1002) gv1872: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1003: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1118, gv1872, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1118) gv1873: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1119: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1873, R.dtype("float16")) _1117: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1003, alloc1119) R.vm.kill_object(reshape1003) gv1874: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1004: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1119, gv1874, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1119) gv1875: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1005: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1004, gv1875, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1004) model_decoder_layers_29_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] model_decoder_layers_29_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1191] gv1876: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1120: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1876, R.dtype("float16")) _1118: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_self_attn_out_proj_weight3, reshape1005, model_decoder_layers_29_self_attn_out_proj_bias3, alloc1120) R.vm.kill_object(reshape1005) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias3) gv1877: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1121: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1877, R.dtype("float16")) cls.add(alloc1113, alloc1120, alloc1121) R.vm.kill_object(alloc1113) R.vm.kill_object(alloc1120) model_decoder_layers_29_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1201] model_decoder_layers_29_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1202] gv1878: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1122: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1878, R.dtype("float16")) cls.layer_norm(alloc1121, model_decoder_layers_29_encoder_attn_layer_norm_weight3, model_decoder_layers_29_encoder_attn_layer_norm_bias3, alloc1122) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias3) model_decoder_layers_29_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] model_decoder_layers_29_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1198] gv1879: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1123: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1879, R.dtype("float16")) _1121: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_encoder_attn_q_proj_weight3, alloc1122, model_decoder_layers_29_encoder_attn_q_proj_bias3, alloc1123) R.vm.kill_object(alloc1122) 
R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias3) gv1880: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1006: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1123, gv1880, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1123) gv1881: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1007: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1006, gv1881, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1006) gv1882: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1124: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1882, R.dtype("float16")) _1122: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1007, alloc1124) R.vm.kill_object(reshape1007) gv1883: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1008: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1124, gv1883, 
sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1124) gv1884: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1009: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1008, gv1884, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1008) model_decoder_layers_29_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] model_decoder_layers_29_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1200] gv1885: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1125: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1885, R.dtype("float16")) _1123: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_encoder_attn_out_proj_weight3, reshape1009, model_decoder_layers_29_encoder_attn_out_proj_bias3, alloc1125) R.vm.kill_object(reshape1009) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias3) gv1886: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1126: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1886, R.dtype("float16")) cls.add(alloc1121, alloc1125, alloc1126) R.vm.kill_object(alloc1121) R.vm.kill_object(alloc1125) 
model_decoder_layers_29_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1207] model_decoder_layers_29_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1208] gv1887: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1127: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1887, R.dtype("float16")) cls.layer_norm(alloc1126, model_decoder_layers_29_final_layer_norm_weight3, model_decoder_layers_29_final_layer_norm_bias3, alloc1127) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias3) model_decoder_layers_29_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] model_decoder_layers_29_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1204] gv1888: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1128: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1888, R.dtype("float16")) _1126: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_29_fc1_weight3, alloc1127, model_decoder_layers_29_fc1_bias3, alloc1128) R.vm.kill_object(alloc1127) R.vm.kill_object(model_decoder_layers_29_fc1_weight3) R.vm.kill_object(model_decoder_layers_29_fc1_bias3) model_decoder_layers_29_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] model_decoder_layers_29_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1206] gv1889: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1129: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1889, R.dtype("float16")) _1127: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_29_fc2_weight3, alloc1128, model_decoder_layers_29_fc2_bias3, alloc1129) R.vm.kill_object(alloc1128) R.vm.kill_object(model_decoder_layers_29_fc2_weight3) R.vm.kill_object(model_decoder_layers_29_fc2_bias3) gv1890: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1130: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1890, R.dtype("float16")) cls.add(alloc1126, alloc1129, alloc1130) R.vm.kill_object(alloc1126) R.vm.kill_object(alloc1129) model_decoder_layers_30_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1216] model_decoder_layers_30_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1217] gv1891: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1131: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1891, R.dtype("float16")) cls.layer_norm(alloc1130, model_decoder_layers_30_self_attn_layer_norm_weight3, model_decoder_layers_30_self_attn_layer_norm_bias3, alloc1131) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias3) model_decoder_layers_30_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] 
model_decoder_layers_30_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1213] gv1892: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1132: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1892, R.dtype("float16")) _1130: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_self_attn_q_proj_weight3, alloc1131, model_decoder_layers_30_self_attn_q_proj_bias3, alloc1132) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias3) gv1893: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1010: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1132, gv1893, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1132) model_decoder_layers_30_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] gv1894: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1133: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1894, R.dtype("float16")) _1131: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_30_self_attn_k_proj_weight3, alloc1131, alloc1133) R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight3) gv1895: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1011: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1133, gv1895, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1133) model_decoder_layers_30_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] model_decoder_layers_30_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1211] gv1896: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1134: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1896, R.dtype("float16")) _1132: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_self_attn_v_proj_weight3, alloc1131, model_decoder_layers_30_self_attn_v_proj_bias3, alloc1134) R.vm.kill_object(alloc1131) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias3) gv1897: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1012: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1134, gv1897, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1134) gv1898: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1135: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1898, R.dtype("float16")) cls.concatenate(reshape1010, reshape1011, reshape1012, alloc1135) R.vm.kill_object(reshape1010) R.vm.kill_object(reshape1011) R.vm.kill_object(reshape1012) gv1899: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1013: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1135, gv1899, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1135) gv1900: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1136: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1900, R.dtype("float16")) _1134: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1013, alloc1136) R.vm.kill_object(reshape1013) gv1901: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1014: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1136, gv1901, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1136) gv1902: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1015: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1014, gv1902, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1014) model_decoder_layers_30_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] model_decoder_layers_30_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1215] gv1903: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1137: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1903, R.dtype("float16")) _1135: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_self_attn_out_proj_weight3, reshape1015, model_decoder_layers_30_self_attn_out_proj_bias3, alloc1137) R.vm.kill_object(reshape1015) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias3) gv1904: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1138: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1904, R.dtype("float16")) cls.add(alloc1130, alloc1137, alloc1138) R.vm.kill_object(alloc1130) R.vm.kill_object(alloc1137) model_decoder_layers_30_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1225] model_decoder_layers_30_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1226] gv1905: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1139: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1905, R.dtype("float16")) cls.layer_norm(alloc1138, model_decoder_layers_30_encoder_attn_layer_norm_weight3, model_decoder_layers_30_encoder_attn_layer_norm_bias3, alloc1139) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias3) model_decoder_layers_30_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] model_decoder_layers_30_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1222] gv1906: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1140: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1906, R.dtype("float16")) _1138: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_encoder_attn_q_proj_weight3, alloc1139, model_decoder_layers_30_encoder_attn_q_proj_bias3, alloc1140) R.vm.kill_object(alloc1139) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias3) gv1907: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1016: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1140, gv1907, sinfo_args=(R.Tensor((batch_size, 1, 
20, 64), dtype="float16"),)) R.vm.kill_object(alloc1140) gv1908: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1017: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1016, gv1908, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1016) gv1909: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1141: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1909, R.dtype("float16")) _1139: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1017, alloc1141) R.vm.kill_object(reshape1017) gv1910: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1018: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1141, gv1910, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1141) gv1911: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1019: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1018, gv1911, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1018) 
model_decoder_layers_30_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] model_decoder_layers_30_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1224] gv1912: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1142: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1912, R.dtype("float16")) _1140: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_encoder_attn_out_proj_weight3, reshape1019, model_decoder_layers_30_encoder_attn_out_proj_bias3, alloc1142) R.vm.kill_object(reshape1019) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias3) gv1913: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1143: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1913, R.dtype("float16")) cls.add(alloc1138, alloc1142, alloc1143) R.vm.kill_object(alloc1138) R.vm.kill_object(alloc1142) model_decoder_layers_30_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1231] model_decoder_layers_30_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1232] gv1914: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1144: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1914, R.dtype("float16")) 
cls.layer_norm(alloc1143, model_decoder_layers_30_final_layer_norm_weight3, model_decoder_layers_30_final_layer_norm_bias3, alloc1144) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias3) model_decoder_layers_30_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1227] model_decoder_layers_30_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1228] gv1915: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1145: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1915, R.dtype("float16")) _1143: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_30_fc1_weight3, alloc1144, model_decoder_layers_30_fc1_bias3, alloc1145) R.vm.kill_object(alloc1144) R.vm.kill_object(model_decoder_layers_30_fc1_weight3) R.vm.kill_object(model_decoder_layers_30_fc1_bias3) model_decoder_layers_30_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] model_decoder_layers_30_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1230] gv1916: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1146: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1916, R.dtype("float16")) _1144: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_30_fc2_weight3, alloc1145, model_decoder_layers_30_fc2_bias3, alloc1146) R.vm.kill_object(alloc1145) R.vm.kill_object(model_decoder_layers_30_fc2_weight3) 
R.vm.kill_object(model_decoder_layers_30_fc2_bias3) gv1917: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1147: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1917, R.dtype("float16")) cls.add(alloc1143, alloc1146, alloc1147) R.vm.kill_object(alloc1143) R.vm.kill_object(alloc1146) model_decoder_layers_31_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1240] model_decoder_layers_31_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1241] gv1918: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1148: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1918, R.dtype("float16")) cls.layer_norm(alloc1147, model_decoder_layers_31_self_attn_layer_norm_weight3, model_decoder_layers_31_self_attn_layer_norm_bias3, alloc1148) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias3) model_decoder_layers_31_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] model_decoder_layers_31_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1237] gv1919: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1149: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1919, R.dtype("float16")) _1147: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_self_attn_q_proj_weight3, alloc1148, model_decoder_layers_31_self_attn_q_proj_bias3, alloc1149) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias3) gv1920: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1020: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1149, gv1920, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1149) model_decoder_layers_31_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] gv1921: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1150: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1921, R.dtype("float16")) _1148: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_31_self_attn_k_proj_weight3, alloc1148, alloc1150) R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight3) gv1922: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1021: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1150, gv1922, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1150) 
model_decoder_layers_31_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] model_decoder_layers_31_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1235] gv1923: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1151: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1923, R.dtype("float16")) _1149: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_self_attn_v_proj_weight3, alloc1148, model_decoder_layers_31_self_attn_v_proj_bias3, alloc1151) R.vm.kill_object(alloc1148) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight3) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias3) gv1924: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1022: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1151, gv1924, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1151) gv1925: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1152: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1925, R.dtype("float16")) cls.concatenate(reshape1020, reshape1021, reshape1022, alloc1152) R.vm.kill_object(reshape1020) R.vm.kill_object(reshape1021) R.vm.kill_object(reshape1022) gv1926: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1023: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1152, gv1926, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc1152) gv1927: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1153: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1927, R.dtype("float16")) _1151: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1023, alloc1153) R.vm.kill_object(reshape1023) gv1928: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1024: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1153, gv1928, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1153) gv1929: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1025: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1024, gv1929, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1024) model_decoder_layers_31_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[1238] model_decoder_layers_31_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1239] gv1930: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1154: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1930, R.dtype("float16")) _1152: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_self_attn_out_proj_weight3, reshape1025, model_decoder_layers_31_self_attn_out_proj_bias3, alloc1154) R.vm.kill_object(reshape1025) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias3) gv1931: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1155: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1931, R.dtype("float16")) cls.add(alloc1147, alloc1154, alloc1155) R.vm.kill_object(alloc1147) R.vm.kill_object(alloc1154) model_decoder_layers_31_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1249] model_decoder_layers_31_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1250] gv1932: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1156: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1932, R.dtype("float16")) cls.layer_norm(alloc1155, model_decoder_layers_31_encoder_attn_layer_norm_weight3, 
model_decoder_layers_31_encoder_attn_layer_norm_bias3, alloc1156) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias3) model_decoder_layers_31_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] model_decoder_layers_31_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1246] gv1933: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1157: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1933, R.dtype("float16")) _1155: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_encoder_attn_q_proj_weight3, alloc1156, model_decoder_layers_31_encoder_attn_q_proj_bias3, alloc1157) R.vm.kill_object(alloc1156) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight3) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias3) gv1934: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1026: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1157, gv1934, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1157) gv1935: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1027: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1026, gv1935, 
sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1026) gv1936: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1158: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1936, R.dtype("float16")) _1156: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1027, alloc1158) R.vm.kill_object(reshape1027) gv1937: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1028: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1158, gv1937, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1158) gv1938: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1029: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1028, gv1938, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) R.vm.kill_object(reshape1028) model_decoder_layers_31_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] model_decoder_layers_31_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1248] gv1939: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1159: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1939, R.dtype("float16")) _1157: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_encoder_attn_out_proj_weight3, reshape1029, model_decoder_layers_31_encoder_attn_out_proj_bias3, alloc1159) R.vm.kill_object(reshape1029) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight3) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias3) gv1940: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1160: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1940, R.dtype("float16")) R.vm.kill_object(storage15) cls.add(alloc1155, alloc1159, alloc1160) R.vm.kill_object(alloc1155) R.vm.kill_object(alloc1159) model_decoder_layers_31_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1255] model_decoder_layers_31_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1256] gv1941: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1161: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1941, R.dtype("float16")) cls.layer_norm(alloc1160, model_decoder_layers_31_final_layer_norm_weight3, model_decoder_layers_31_final_layer_norm_bias3, alloc1161) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight3) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias3) model_decoder_layers_31_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] model_decoder_layers_31_fc1_bias3: 
R.Tensor((5120,), dtype="float16") = packed_params[1252] gv1942: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1162: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1942, R.dtype("float16")) R.vm.kill_object(storage13) _1160: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_31_fc1_weight3, alloc1161, model_decoder_layers_31_fc1_bias3, alloc1162) R.vm.kill_object(alloc1161) R.vm.kill_object(model_decoder_layers_31_fc1_weight3) R.vm.kill_object(model_decoder_layers_31_fc1_bias3) model_decoder_layers_31_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] model_decoder_layers_31_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1254] gv1943: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1163: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1943, R.dtype("float16")) R.vm.kill_object(storage14) _1161: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_31_fc2_weight3, alloc1162, model_decoder_layers_31_fc2_bias3, alloc1163) R.vm.kill_object(alloc1162) R.vm.kill_object(model_decoder_layers_31_fc2_weight3) R.vm.kill_object(model_decoder_layers_31_fc2_bias3) gv1944: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1164: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1944, 
R.dtype("float16")) R.vm.kill_object(storage16) cls.add(alloc1160, alloc1163, alloc1164) R.vm.kill_object(alloc1160) R.vm.kill_object(alloc1163) model_decoder_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1257] model_decoder_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1258] gv1945: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1165: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1945, R.dtype("float16")) R.vm.kill_object(storage17) cls.layer_norm(alloc1164, model_decoder_layer_norm_weight3, model_decoder_layer_norm_bias3, alloc1165) R.vm.kill_object(alloc1164) R.vm.kill_object(model_decoder_layer_norm_weight3) R.vm.kill_object(model_decoder_layer_norm_bias3) storage18: R.Object = R.vm.alloc_storage(R.shape([1659712]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1946: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(51866), sinfo_args=(R.Shape(ndim=3),)) alloc1166: R.Tensor(dtype="float32", ndim=3) = R.vm.alloc_tensor(storage18, R.prim_value(0), gv1946, R.dtype("float32")) R.vm.kill_object(storage18) _1164: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul4_cublas", model_decoder_embed_tokens_weight3, alloc1165, alloc1166) R.vm.kill_object(model_decoder_embed_tokens_weight3) R.vm.kill_object(alloc1165) R.call_packed("vm.builtin.match_shape", alloc1166, shape_heap, R.prim_value(3), R.prim_value(3), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(51866), R.str("ErrorContext(fn=batch_decode, loc=return, annotation=R.Tensor((batch_size, 1, 51866), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) return alloc1166 
@R.function def batch_encode(input_features: R.Tensor(("batch_size", 128, 3000), dtype="float16"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor(("batch_size", 1500, 1280), dtype="float16"): batch_size = T.int64() R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", input_features, R.prim_value(3), R.dtype("float16"), R.str("ErrorContext(fn=batch_encode, loc=param[0], param=input_features, annotation=R.Tensor((batch_size, 128, 3000), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_encode, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", input_features, shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(128), R.prim_value(0), R.prim_value(3000), R.str("ErrorContext(fn=batch_encode, loc=param[0], param=input_features, annotation=R.Tensor((batch_size, 128, 3000), dtype=\"float16\")) "), 
sinfo_args=(R.Tuple,)) cls.shape_func1(shape_heap) lv: R.Tensor((1280,), dtype="float16") = packed_params[1] lv1: R.Tensor((1, 1280, 1), dtype="float16") = R.call_packed("vm.builtin.reshape", lv, R.shape([1, 1280, 1]), sinfo_args=(R.Tensor((1, 1280, 1), dtype="float16"),)) R.vm.kill_object(lv) lv2: R.Tensor((1280,), dtype="float16") = packed_params[3] lv3: R.Tensor((1, 1280, 1), dtype="float16") = R.call_packed("vm.builtin.reshape", lv2, R.shape([1, 1280, 1]), sinfo_args=(R.Tensor((1, 1280, 1), dtype="float16"),)) R.vm.kill_object(lv2) model_encoder_conv1_weight: R.Tensor((1280, 128, 3), dtype="float16") = packed_params[0] storage24: R.Object = R.vm.alloc_storage(R.shape([122880000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1947: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), R.prim_value(0), R.prim_value(3000), sinfo_args=(R.Shape(ndim=3),)) alloc1620: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1947, R.dtype("float16")) cls.fused_conv1d_add1_gelu(input_features, model_encoder_conv1_weight, lv1, alloc1620) R.vm.kill_object(lv1) R.vm.kill_object(model_encoder_conv1_weight) model_encoder_conv2_weight: R.Tensor((1280, 1280, 3), dtype="float16") = packed_params[2] storage25: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1948: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), R.prim_value(0), R.prim_value(1500), sinfo_args=(R.Shape(ndim=3),)) alloc1621: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1948, R.dtype("float16")) cls.fused_conv1d1_add2_gelu1(alloc1620, model_encoder_conv2_weight, lv3, alloc1621) R.vm.kill_object(lv3) R.vm.kill_object(alloc1620) R.vm.kill_object(model_encoder_conv2_weight) lv6: 
R.Tensor((1500, 1280), dtype="float16") = packed_params[4] gv1949: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1622: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1949, R.dtype("float16")) cls.fused_transpose_add3(lv6, alloc1621, alloc1622) R.vm.kill_object(alloc1621) R.vm.kill_object(lv6) model_encoder_layers_0_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[12] model_encoder_layers_0_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[13] gv1950: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1623: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1950, R.dtype("float16")) cls.layer_norm1(alloc1622, model_encoder_layers_0_self_attn_layer_norm_weight, model_encoder_layers_0_self_attn_layer_norm_bias, alloc1623) R.vm.kill_object(model_encoder_layers_0_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_0_self_attn_layer_norm_bias) model_encoder_layers_0_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[8] model_encoder_layers_0_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[9] storage26: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1951: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1624: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, 
R.prim_value(0), gv1951, R.dtype("float16")) _1622: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_0_self_attn_q_proj_weight, alloc1623, model_encoder_layers_0_self_attn_q_proj_bias, alloc1624) R.vm.kill_object(model_encoder_layers_0_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_0_self_attn_q_proj_bias) gv1952: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1624, gv1952, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1624) model_encoder_layers_0_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[5] storage27: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1953: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1625: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1953, R.dtype("float16")) _1623: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_0_self_attn_k_proj_weight, alloc1623, alloc1625) R.vm.kill_object(model_encoder_layers_0_self_attn_k_proj_weight) gv1954: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1625, gv1954, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1625) model_encoder_layers_0_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[6] model_encoder_layers_0_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[7] storage28: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1955: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1626: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1955, R.dtype("float16")) _1624: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_0_self_attn_v_proj_weight, alloc1623, model_encoder_layers_0_self_attn_v_proj_bias, alloc1626) R.vm.kill_object(alloc1623) R.vm.kill_object(model_encoder_layers_0_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_0_self_attn_v_proj_bias) gv1956: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape2: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1626, gv1956, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1626) gv1957: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape3: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape, gv1957, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape) gv1958: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape4: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1, gv1958, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1) gv1959: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape5: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape2, gv1959, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape2) gv1960: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1627: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1960, R.dtype("float16")) _1625: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape3, reshape4, reshape5, alloc1627) R.vm.kill_object(reshape3) R.vm.kill_object(reshape4) R.vm.kill_object(reshape5) gv1961: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape6: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1627, gv1961, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1627) gv1962: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape7: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape6, gv1962, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape6) model_encoder_layers_0_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[10] model_encoder_layers_0_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[11] gv1963: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1628: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1963, R.dtype("float16")) _1626: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_0_self_attn_out_proj_weight, reshape7, model_encoder_layers_0_self_attn_out_proj_bias, alloc1628) R.vm.kill_object(reshape7) R.vm.kill_object(model_encoder_layers_0_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_0_self_attn_out_proj_bias) gv1964: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1629: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1964, R.dtype("float16")) cls.add4(alloc1622, alloc1628, alloc1629) R.vm.kill_object(alloc1622) R.vm.kill_object(alloc1628) 
model_encoder_layers_0_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[18] model_encoder_layers_0_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[19] gv1965: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1630: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1965, R.dtype("float16")) cls.layer_norm1(alloc1629, model_encoder_layers_0_final_layer_norm_weight, model_encoder_layers_0_final_layer_norm_bias, alloc1630) R.vm.kill_object(model_encoder_layers_0_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_0_final_layer_norm_bias) model_encoder_layers_0_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[14] model_encoder_layers_0_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[15] gv1966: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1631: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1966, R.dtype("float16")) _1629: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_0_fc1_weight, alloc1630, model_encoder_layers_0_fc1_bias, alloc1631) R.vm.kill_object(alloc1630) R.vm.kill_object(model_encoder_layers_0_fc1_weight) R.vm.kill_object(model_encoder_layers_0_fc1_bias) model_encoder_layers_0_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[16] model_encoder_layers_0_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[17] gv1967: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1632: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1967, R.dtype("float16")) _1630: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_0_fc2_weight, alloc1631, model_encoder_layers_0_fc2_bias, alloc1632) R.vm.kill_object(alloc1631) R.vm.kill_object(model_encoder_layers_0_fc2_weight) R.vm.kill_object(model_encoder_layers_0_fc2_bias) gv1968: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1633: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1968, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1629, alloc1632, alloc1633) R.vm.kill_object(alloc1629) R.vm.kill_object(alloc1632) model_encoder_layers_1_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[27] model_encoder_layers_1_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[28] gv1969: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1634: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1969, R.dtype("float16")) cls.layer_norm1(alloc1633, model_encoder_layers_1_self_attn_layer_norm_weight, model_encoder_layers_1_self_attn_layer_norm_bias, alloc1634) R.vm.kill_object(model_encoder_layers_1_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_1_self_attn_layer_norm_bias) model_encoder_layers_1_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[23] 
model_encoder_layers_1_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[24] gv1970: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1635: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1970, R.dtype("float16")) _1633: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_1_self_attn_q_proj_weight, alloc1634, model_encoder_layers_1_self_attn_q_proj_bias, alloc1635) R.vm.kill_object(model_encoder_layers_1_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_1_self_attn_q_proj_bias) gv1971: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape8: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1635, gv1971, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1635) model_encoder_layers_1_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[20] gv1972: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1636: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1972, R.dtype("float16")) _1634: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_1_self_attn_k_proj_weight, alloc1634, alloc1636) R.vm.kill_object(model_encoder_layers_1_self_attn_k_proj_weight) gv1973: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape9: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1636, gv1973, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1636) model_encoder_layers_1_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[21] model_encoder_layers_1_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[22] gv1974: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1637: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1974, R.dtype("float16")) _1635: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_1_self_attn_v_proj_weight, alloc1634, model_encoder_layers_1_self_attn_v_proj_bias, alloc1637) R.vm.kill_object(alloc1634) R.vm.kill_object(model_encoder_layers_1_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_1_self_attn_v_proj_bias) gv1975: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape10: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1637, gv1975, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1637) gv1976: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape11: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape8, gv1976, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape8) gv1977: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape12: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape9, gv1977, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape9) gv1978: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape13: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape10, gv1978, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape10) gv1979: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1638: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1979, R.dtype("float16")) _1636: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape11, reshape12, reshape13, alloc1638) R.vm.kill_object(reshape11) R.vm.kill_object(reshape12) R.vm.kill_object(reshape13) gv1980: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape14: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1638, gv1980, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1638) gv1981: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape15: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape14, gv1981, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape14) model_encoder_layers_1_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[25] model_encoder_layers_1_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[26] gv1982: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1639: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1982, R.dtype("float16")) _1637: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_1_self_attn_out_proj_weight, reshape15, model_encoder_layers_1_self_attn_out_proj_bias, alloc1639) R.vm.kill_object(reshape15) R.vm.kill_object(model_encoder_layers_1_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_1_self_attn_out_proj_bias) gv1983: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1640: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1983, R.dtype("float16")) cls.add4(alloc1633, alloc1639, alloc1640) R.vm.kill_object(alloc1633) R.vm.kill_object(alloc1639) model_encoder_layers_1_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[33] model_encoder_layers_1_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[34] gv1984: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1641: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1984, R.dtype("float16")) cls.layer_norm1(alloc1640, model_encoder_layers_1_final_layer_norm_weight, model_encoder_layers_1_final_layer_norm_bias, alloc1641) R.vm.kill_object(model_encoder_layers_1_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_1_final_layer_norm_bias) model_encoder_layers_1_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[29] model_encoder_layers_1_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[30] gv1985: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1642: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1985, R.dtype("float16")) _1640: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_1_fc1_weight, alloc1641, model_encoder_layers_1_fc1_bias, alloc1642) R.vm.kill_object(alloc1641) R.vm.kill_object(model_encoder_layers_1_fc1_weight) R.vm.kill_object(model_encoder_layers_1_fc1_bias) model_encoder_layers_1_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[31] model_encoder_layers_1_fc2_bias: 
R.Tensor((1280,), dtype="float16") = packed_params[32] gv1986: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1643: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1986, R.dtype("float16")) _1641: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_1_fc2_weight, alloc1642, model_encoder_layers_1_fc2_bias, alloc1643) R.vm.kill_object(alloc1642) R.vm.kill_object(model_encoder_layers_1_fc2_weight) R.vm.kill_object(model_encoder_layers_1_fc2_bias) gv1987: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1644: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1987, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1640, alloc1643, alloc1644) R.vm.kill_object(alloc1640) R.vm.kill_object(alloc1643) model_encoder_layers_2_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[42] model_encoder_layers_2_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[43] gv1988: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1645: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1988, R.dtype("float16")) cls.layer_norm1(alloc1644, model_encoder_layers_2_self_attn_layer_norm_weight, model_encoder_layers_2_self_attn_layer_norm_bias, alloc1645) R.vm.kill_object(model_encoder_layers_2_self_attn_layer_norm_weight) 
R.vm.kill_object(model_encoder_layers_2_self_attn_layer_norm_bias) model_encoder_layers_2_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[38] model_encoder_layers_2_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[39] gv1989: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1646: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1989, R.dtype("float16")) _1644: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_2_self_attn_q_proj_weight, alloc1645, model_encoder_layers_2_self_attn_q_proj_bias, alloc1646) R.vm.kill_object(model_encoder_layers_2_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_2_self_attn_q_proj_bias) gv1990: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape16: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1646, gv1990, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1646) model_encoder_layers_2_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[35] gv1991: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1647: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1991, R.dtype("float16")) _1645: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", 
model_encoder_layers_2_self_attn_k_proj_weight, alloc1645, alloc1647) R.vm.kill_object(model_encoder_layers_2_self_attn_k_proj_weight) gv1992: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape17: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1647, gv1992, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1647) model_encoder_layers_2_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[36] model_encoder_layers_2_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[37] gv1993: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1648: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1993, R.dtype("float16")) _1646: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_2_self_attn_v_proj_weight, alloc1645, model_encoder_layers_2_self_attn_v_proj_bias, alloc1648) R.vm.kill_object(alloc1645) R.vm.kill_object(model_encoder_layers_2_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_2_self_attn_v_proj_bias) gv1994: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape18: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1648, gv1994, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc1648) gv1995: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape19: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape16, gv1995, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape16) gv1996: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape20: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape17, gv1996, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape17) gv1997: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape21: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape18, gv1997, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape18) gv1998: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1649: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1998, R.dtype("float16")) _1647: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape19, reshape20, reshape21, alloc1649) R.vm.kill_object(reshape19) R.vm.kill_object(reshape20) 
R.vm.kill_object(reshape21) gv1999: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape22: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1649, gv1999, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1649) gv2000: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape23: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape22, gv2000, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape22) model_encoder_layers_2_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[40] model_encoder_layers_2_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[41] gv2001: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1650: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2001, R.dtype("float16")) _1648: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_2_self_attn_out_proj_weight, reshape23, model_encoder_layers_2_self_attn_out_proj_bias, alloc1650) R.vm.kill_object(reshape23) R.vm.kill_object(model_encoder_layers_2_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_2_self_attn_out_proj_bias) gv2002: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1651: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2002, R.dtype("float16")) cls.add4(alloc1644, alloc1650, alloc1651) R.vm.kill_object(alloc1644) R.vm.kill_object(alloc1650) model_encoder_layers_2_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[48] model_encoder_layers_2_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[49] gv2003: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1652: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2003, R.dtype("float16")) cls.layer_norm1(alloc1651, model_encoder_layers_2_final_layer_norm_weight, model_encoder_layers_2_final_layer_norm_bias, alloc1652) R.vm.kill_object(model_encoder_layers_2_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_2_final_layer_norm_bias) model_encoder_layers_2_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[44] model_encoder_layers_2_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[45] gv2004: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1653: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2004, R.dtype("float16")) _1651: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_2_fc1_weight, alloc1652, model_encoder_layers_2_fc1_bias, alloc1653) R.vm.kill_object(alloc1652) R.vm.kill_object(model_encoder_layers_2_fc1_weight) 
R.vm.kill_object(model_encoder_layers_2_fc1_bias) model_encoder_layers_2_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[46] model_encoder_layers_2_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[47] gv2005: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1654: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2005, R.dtype("float16")) _1652: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_2_fc2_weight, alloc1653, model_encoder_layers_2_fc2_bias, alloc1654) R.vm.kill_object(alloc1653) R.vm.kill_object(model_encoder_layers_2_fc2_weight) R.vm.kill_object(model_encoder_layers_2_fc2_bias) gv2006: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1655: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2006, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1651, alloc1654, alloc1655) R.vm.kill_object(alloc1651) R.vm.kill_object(alloc1654) model_encoder_layers_3_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[57] model_encoder_layers_3_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[58] gv2007: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1656: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2007, R.dtype("float16")) cls.layer_norm1(alloc1655, 
model_encoder_layers_3_self_attn_layer_norm_weight, model_encoder_layers_3_self_attn_layer_norm_bias, alloc1656) R.vm.kill_object(model_encoder_layers_3_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_3_self_attn_layer_norm_bias) model_encoder_layers_3_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[53] model_encoder_layers_3_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[54] gv2008: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1657: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2008, R.dtype("float16")) _1655: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_3_self_attn_q_proj_weight, alloc1656, model_encoder_layers_3_self_attn_q_proj_bias, alloc1657) R.vm.kill_object(model_encoder_layers_3_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_3_self_attn_q_proj_bias) gv2009: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape24: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1657, gv2009, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1657) model_encoder_layers_3_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[50] gv2010: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1658: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2010, R.dtype("float16")) _1656: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_3_self_attn_k_proj_weight, alloc1656, alloc1658) R.vm.kill_object(model_encoder_layers_3_self_attn_k_proj_weight) gv2011: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape25: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1658, gv2011, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1658) model_encoder_layers_3_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[51] model_encoder_layers_3_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[52] gv2012: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1659: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2012, R.dtype("float16")) _1657: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_3_self_attn_v_proj_weight, alloc1656, model_encoder_layers_3_self_attn_v_proj_bias, alloc1659) R.vm.kill_object(alloc1656) R.vm.kill_object(model_encoder_layers_3_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_3_self_attn_v_proj_bias) gv2013: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape26: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1659, gv2013, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1659) gv2014: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape27: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape24, gv2014, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape24) gv2015: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape28: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape25, gv2015, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape25) gv2016: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape29: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape26, gv2016, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape26) gv2017: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1660: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2017, R.dtype("float16")) _1658: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape27, reshape28, reshape29, alloc1660) R.vm.kill_object(reshape27) R.vm.kill_object(reshape28) R.vm.kill_object(reshape29) gv2018: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape30: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1660, gv2018, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1660) gv2019: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape31: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape30, gv2019, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape30) model_encoder_layers_3_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[55] model_encoder_layers_3_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[56] gv2020: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1661: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2020, R.dtype("float16")) _1659: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_3_self_attn_out_proj_weight, reshape31, model_encoder_layers_3_self_attn_out_proj_bias, alloc1661) R.vm.kill_object(reshape31) 
R.vm.kill_object(model_encoder_layers_3_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_3_self_attn_out_proj_bias) gv2021: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1662: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2021, R.dtype("float16")) cls.add4(alloc1655, alloc1661, alloc1662) R.vm.kill_object(alloc1655) R.vm.kill_object(alloc1661) model_encoder_layers_3_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[63] model_encoder_layers_3_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[64] gv2022: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1663: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2022, R.dtype("float16")) cls.layer_norm1(alloc1662, model_encoder_layers_3_final_layer_norm_weight, model_encoder_layers_3_final_layer_norm_bias, alloc1663) R.vm.kill_object(model_encoder_layers_3_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_3_final_layer_norm_bias) model_encoder_layers_3_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[59] model_encoder_layers_3_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[60] gv2023: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1664: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2023, R.dtype("float16")) _1662: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_3_fc1_weight, alloc1663, model_encoder_layers_3_fc1_bias, alloc1664) R.vm.kill_object(alloc1663) R.vm.kill_object(model_encoder_layers_3_fc1_weight) R.vm.kill_object(model_encoder_layers_3_fc1_bias) model_encoder_layers_3_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[61] model_encoder_layers_3_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[62] gv2024: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1665: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2024, R.dtype("float16")) _1663: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_3_fc2_weight, alloc1664, model_encoder_layers_3_fc2_bias, alloc1665) R.vm.kill_object(alloc1664) R.vm.kill_object(model_encoder_layers_3_fc2_weight) R.vm.kill_object(model_encoder_layers_3_fc2_bias) gv2025: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1666: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2025, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1662, alloc1665, alloc1666) R.vm.kill_object(alloc1662) R.vm.kill_object(alloc1665) model_encoder_layers_4_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[72] model_encoder_layers_4_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[73] gv2026: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1667: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2026, R.dtype("float16")) cls.layer_norm1(alloc1666, model_encoder_layers_4_self_attn_layer_norm_weight, model_encoder_layers_4_self_attn_layer_norm_bias, alloc1667) R.vm.kill_object(model_encoder_layers_4_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_4_self_attn_layer_norm_bias) model_encoder_layers_4_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[68] model_encoder_layers_4_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[69] gv2027: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1668: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2027, R.dtype("float16")) _1666: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_4_self_attn_q_proj_weight, alloc1667, model_encoder_layers_4_self_attn_q_proj_bias, alloc1668) R.vm.kill_object(model_encoder_layers_4_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_4_self_attn_q_proj_bias) gv2028: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape32: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1668, gv2028, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1668) model_encoder_layers_4_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[65] gv2029: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1669: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2029, R.dtype("float16")) _1667: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_4_self_attn_k_proj_weight, alloc1667, alloc1669) R.vm.kill_object(model_encoder_layers_4_self_attn_k_proj_weight) gv2030: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape33: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1669, gv2030, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1669) model_encoder_layers_4_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[66] model_encoder_layers_4_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[67] gv2031: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1670: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2031, R.dtype("float16")) _1668: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_4_self_attn_v_proj_weight, alloc1667, model_encoder_layers_4_self_attn_v_proj_bias, alloc1670) R.vm.kill_object(alloc1667) R.vm.kill_object(model_encoder_layers_4_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_4_self_attn_v_proj_bias) gv2032: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape34: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1670, gv2032, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1670) gv2033: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape35: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape32, gv2033, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape32) gv2034: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape36: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape33, gv2034, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape33) gv2035: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape37: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape34, gv2035, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape34) gv2036: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1671: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2036, R.dtype("float16")) _1669: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape35, reshape36, reshape37, alloc1671) R.vm.kill_object(reshape35) R.vm.kill_object(reshape36) R.vm.kill_object(reshape37) gv2037: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape38: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1671, gv2037, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1671) gv2038: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape39: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape38, gv2038, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape38) model_encoder_layers_4_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[70] model_encoder_layers_4_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[71] gv2039: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1672: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2039, R.dtype("float16")) _1670: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_4_self_attn_out_proj_weight, reshape39, model_encoder_layers_4_self_attn_out_proj_bias, alloc1672) R.vm.kill_object(reshape39) R.vm.kill_object(model_encoder_layers_4_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_4_self_attn_out_proj_bias) gv2040: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1673: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2040, R.dtype("float16")) cls.add4(alloc1666, alloc1672, alloc1673) R.vm.kill_object(alloc1666) R.vm.kill_object(alloc1672) model_encoder_layers_4_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[78] model_encoder_layers_4_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[79] gv2041: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1674: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2041, R.dtype("float16")) cls.layer_norm1(alloc1673, model_encoder_layers_4_final_layer_norm_weight, model_encoder_layers_4_final_layer_norm_bias, alloc1674) R.vm.kill_object(model_encoder_layers_4_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_4_final_layer_norm_bias) model_encoder_layers_4_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[74] model_encoder_layers_4_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[75] gv2042: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1675: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2042, R.dtype("float16")) _1673: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_4_fc1_weight, alloc1674, model_encoder_layers_4_fc1_bias, alloc1675) R.vm.kill_object(alloc1674) R.vm.kill_object(model_encoder_layers_4_fc1_weight) R.vm.kill_object(model_encoder_layers_4_fc1_bias) model_encoder_layers_4_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[76] model_encoder_layers_4_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[77] gv2043: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1676: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2043, R.dtype("float16")) _1674: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_4_fc2_weight, alloc1675, model_encoder_layers_4_fc2_bias, alloc1676) R.vm.kill_object(alloc1675) R.vm.kill_object(model_encoder_layers_4_fc2_weight) R.vm.kill_object(model_encoder_layers_4_fc2_bias) gv2044: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1677: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2044, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1673, alloc1676, alloc1677) R.vm.kill_object(alloc1673) R.vm.kill_object(alloc1676) model_encoder_layers_5_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[87] model_encoder_layers_5_self_attn_layer_norm_bias: R.Tensor((1280,), 
dtype="float16") = packed_params[88] gv2045: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1678: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2045, R.dtype("float16")) cls.layer_norm1(alloc1677, model_encoder_layers_5_self_attn_layer_norm_weight, model_encoder_layers_5_self_attn_layer_norm_bias, alloc1678) R.vm.kill_object(model_encoder_layers_5_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_5_self_attn_layer_norm_bias) model_encoder_layers_5_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[83] model_encoder_layers_5_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[84] gv2046: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1679: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2046, R.dtype("float16")) _1677: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_5_self_attn_q_proj_weight, alloc1678, model_encoder_layers_5_self_attn_q_proj_bias, alloc1679) R.vm.kill_object(model_encoder_layers_5_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_5_self_attn_q_proj_bias) gv2047: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape40: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1679, gv2047, sinfo_args=(R.Tensor((batch_size, 1500, 20, 
64), dtype="float16"),)) R.vm.kill_object(alloc1679) model_encoder_layers_5_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[80] gv2048: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1680: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2048, R.dtype("float16")) _1678: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_5_self_attn_k_proj_weight, alloc1678, alloc1680) R.vm.kill_object(model_encoder_layers_5_self_attn_k_proj_weight) gv2049: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape41: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1680, gv2049, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1680) model_encoder_layers_5_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[81] model_encoder_layers_5_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[82] gv2050: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1681: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2050, R.dtype("float16")) _1679: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_5_self_attn_v_proj_weight, alloc1678, model_encoder_layers_5_self_attn_v_proj_bias, alloc1681) 
R.vm.kill_object(alloc1678) R.vm.kill_object(model_encoder_layers_5_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_5_self_attn_v_proj_bias) gv2051: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape42: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1681, gv2051, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1681) gv2052: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape43: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape40, gv2052, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape40) gv2053: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape44: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape41, gv2053, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape41) gv2054: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape45: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape42, gv2054, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape42) gv2055: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1682: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2055, R.dtype("float16")) _1680: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape43, reshape44, reshape45, alloc1682) R.vm.kill_object(reshape43) R.vm.kill_object(reshape44) R.vm.kill_object(reshape45) gv2056: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape46: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1682, gv2056, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1682) gv2057: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape47: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape46, gv2057, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape46) model_encoder_layers_5_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[85] model_encoder_layers_5_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[86] gv2058: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc1683: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2058, R.dtype("float16")) _1681: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_5_self_attn_out_proj_weight, reshape47, model_encoder_layers_5_self_attn_out_proj_bias, alloc1683) R.vm.kill_object(reshape47) R.vm.kill_object(model_encoder_layers_5_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_5_self_attn_out_proj_bias) gv2059: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1684: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2059, R.dtype("float16")) cls.add4(alloc1677, alloc1683, alloc1684) R.vm.kill_object(alloc1677) R.vm.kill_object(alloc1683) model_encoder_layers_5_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[93] model_encoder_layers_5_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[94] gv2060: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1685: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2060, R.dtype("float16")) cls.layer_norm1(alloc1684, model_encoder_layers_5_final_layer_norm_weight, model_encoder_layers_5_final_layer_norm_bias, alloc1685) R.vm.kill_object(model_encoder_layers_5_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_5_final_layer_norm_bias) model_encoder_layers_5_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[89] model_encoder_layers_5_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[90] gv2061: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1686: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2061, R.dtype("float16")) _1684: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_5_fc1_weight, alloc1685, model_encoder_layers_5_fc1_bias, alloc1686) R.vm.kill_object(alloc1685) R.vm.kill_object(model_encoder_layers_5_fc1_weight) R.vm.kill_object(model_encoder_layers_5_fc1_bias) model_encoder_layers_5_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[91] model_encoder_layers_5_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[92] gv2062: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1687: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2062, R.dtype("float16")) _1685: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_5_fc2_weight, alloc1686, model_encoder_layers_5_fc2_bias, alloc1687) R.vm.kill_object(alloc1686) R.vm.kill_object(model_encoder_layers_5_fc2_weight) R.vm.kill_object(model_encoder_layers_5_fc2_bias) gv2063: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1688: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2063, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1684, alloc1687, alloc1688) R.vm.kill_object(alloc1684) R.vm.kill_object(alloc1687) 
model_encoder_layers_6_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[102] model_encoder_layers_6_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[103] gv2064: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1689: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2064, R.dtype("float16")) cls.layer_norm1(alloc1688, model_encoder_layers_6_self_attn_layer_norm_weight, model_encoder_layers_6_self_attn_layer_norm_bias, alloc1689) R.vm.kill_object(model_encoder_layers_6_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_6_self_attn_layer_norm_bias) model_encoder_layers_6_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[98] model_encoder_layers_6_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[99] gv2065: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1690: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2065, R.dtype("float16")) _1688: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_6_self_attn_q_proj_weight, alloc1689, model_encoder_layers_6_self_attn_q_proj_bias, alloc1690) R.vm.kill_object(model_encoder_layers_6_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_6_self_attn_q_proj_bias) gv2066: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape48: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1690, gv2066, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1690) model_encoder_layers_6_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[95] gv2067: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1691: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2067, R.dtype("float16")) _1689: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_6_self_attn_k_proj_weight, alloc1689, alloc1691) R.vm.kill_object(model_encoder_layers_6_self_attn_k_proj_weight) gv2068: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape49: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1691, gv2068, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1691) model_encoder_layers_6_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[96] model_encoder_layers_6_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[97] gv2069: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1692: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2069, R.dtype("float16")) _1690: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_6_self_attn_v_proj_weight, alloc1689, model_encoder_layers_6_self_attn_v_proj_bias, alloc1692) R.vm.kill_object(alloc1689) R.vm.kill_object(model_encoder_layers_6_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_6_self_attn_v_proj_bias) gv2070: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape50: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1692, gv2070, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1692) gv2071: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape51: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape48, gv2071, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape48) gv2072: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape52: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape49, gv2072, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape49) gv2073: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape53: R.Tensor((batch_size * 1500, 
20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape50, gv2073, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape50) gv2074: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1693: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2074, R.dtype("float16")) _1691: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape51, reshape52, reshape53, alloc1693) R.vm.kill_object(reshape51) R.vm.kill_object(reshape52) R.vm.kill_object(reshape53) gv2075: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape54: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1693, gv2075, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1693) gv2076: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape55: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape54, gv2076, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape54) model_encoder_layers_6_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[100] model_encoder_layers_6_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[101] gv2077: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1694: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2077, R.dtype("float16")) _1692: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_6_self_attn_out_proj_weight, reshape55, model_encoder_layers_6_self_attn_out_proj_bias, alloc1694) R.vm.kill_object(reshape55) R.vm.kill_object(model_encoder_layers_6_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_6_self_attn_out_proj_bias) gv2078: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1695: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2078, R.dtype("float16")) cls.add4(alloc1688, alloc1694, alloc1695) R.vm.kill_object(alloc1688) R.vm.kill_object(alloc1694) model_encoder_layers_6_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[108] model_encoder_layers_6_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[109] gv2079: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1696: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2079, R.dtype("float16")) cls.layer_norm1(alloc1695, model_encoder_layers_6_final_layer_norm_weight, model_encoder_layers_6_final_layer_norm_bias, alloc1696) R.vm.kill_object(model_encoder_layers_6_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_6_final_layer_norm_bias) 
model_encoder_layers_6_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[104] model_encoder_layers_6_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[105] gv2080: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1697: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2080, R.dtype("float16")) _1695: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_6_fc1_weight, alloc1696, model_encoder_layers_6_fc1_bias, alloc1697) R.vm.kill_object(alloc1696) R.vm.kill_object(model_encoder_layers_6_fc1_weight) R.vm.kill_object(model_encoder_layers_6_fc1_bias) model_encoder_layers_6_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[106] model_encoder_layers_6_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[107] gv2081: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1698: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2081, R.dtype("float16")) _1696: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_6_fc2_weight, alloc1697, model_encoder_layers_6_fc2_bias, alloc1698) R.vm.kill_object(alloc1697) R.vm.kill_object(model_encoder_layers_6_fc2_weight) R.vm.kill_object(model_encoder_layers_6_fc2_bias) gv2082: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1699: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage26, R.prim_value(0), gv2082, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1695, alloc1698, alloc1699) R.vm.kill_object(alloc1695) R.vm.kill_object(alloc1698) model_encoder_layers_7_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[117] model_encoder_layers_7_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[118] gv2083: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1700: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2083, R.dtype("float16")) cls.layer_norm1(alloc1699, model_encoder_layers_7_self_attn_layer_norm_weight, model_encoder_layers_7_self_attn_layer_norm_bias, alloc1700) R.vm.kill_object(model_encoder_layers_7_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_7_self_attn_layer_norm_bias) model_encoder_layers_7_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[113] model_encoder_layers_7_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[114] gv2084: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1701: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2084, R.dtype("float16")) _1699: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_7_self_attn_q_proj_weight, alloc1700, model_encoder_layers_7_self_attn_q_proj_bias, alloc1701) R.vm.kill_object(model_encoder_layers_7_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_7_self_attn_q_proj_bias) gv2085: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape56: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1701, gv2085, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1701) model_encoder_layers_7_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[110] gv2086: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1702: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2086, R.dtype("float16")) _1700: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_7_self_attn_k_proj_weight, alloc1700, alloc1702) R.vm.kill_object(model_encoder_layers_7_self_attn_k_proj_weight) gv2087: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape57: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1702, gv2087, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1702) model_encoder_layers_7_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[111] model_encoder_layers_7_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[112] gv2088: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1703: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2088, R.dtype("float16")) _1701: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_7_self_attn_v_proj_weight, alloc1700, model_encoder_layers_7_self_attn_v_proj_bias, alloc1703) R.vm.kill_object(alloc1700) R.vm.kill_object(model_encoder_layers_7_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_7_self_attn_v_proj_bias) gv2089: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape58: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1703, gv2089, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1703) gv2090: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape59: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape56, gv2090, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape56) gv2091: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape60: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape57, gv2091, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape57) gv2092: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape61: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape58, gv2092, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape58) gv2093: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1704: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2093, R.dtype("float16")) _1702: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape59, reshape60, reshape61, alloc1704) R.vm.kill_object(reshape59) R.vm.kill_object(reshape60) R.vm.kill_object(reshape61) gv2094: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape62: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1704, gv2094, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1704) gv2095: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape63: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape62, gv2095, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape62) 
model_encoder_layers_7_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[115] model_encoder_layers_7_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[116] gv2096: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1705: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2096, R.dtype("float16")) _1703: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_7_self_attn_out_proj_weight, reshape63, model_encoder_layers_7_self_attn_out_proj_bias, alloc1705) R.vm.kill_object(reshape63) R.vm.kill_object(model_encoder_layers_7_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_7_self_attn_out_proj_bias) gv2097: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1706: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2097, R.dtype("float16")) cls.add4(alloc1699, alloc1705, alloc1706) R.vm.kill_object(alloc1699) R.vm.kill_object(alloc1705) model_encoder_layers_7_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[123] model_encoder_layers_7_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[124] gv2098: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1707: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2098, R.dtype("float16")) cls.layer_norm1(alloc1706, 
model_encoder_layers_7_final_layer_norm_weight, model_encoder_layers_7_final_layer_norm_bias, alloc1707) R.vm.kill_object(model_encoder_layers_7_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_7_final_layer_norm_bias) model_encoder_layers_7_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[119] model_encoder_layers_7_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[120] gv2099: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1708: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2099, R.dtype("float16")) _1706: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_7_fc1_weight, alloc1707, model_encoder_layers_7_fc1_bias, alloc1708) R.vm.kill_object(alloc1707) R.vm.kill_object(model_encoder_layers_7_fc1_weight) R.vm.kill_object(model_encoder_layers_7_fc1_bias) model_encoder_layers_7_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[121] model_encoder_layers_7_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[122] gv2100: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1709: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2100, R.dtype("float16")) _1707: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_7_fc2_weight, alloc1708, model_encoder_layers_7_fc2_bias, alloc1709) R.vm.kill_object(alloc1708) R.vm.kill_object(model_encoder_layers_7_fc2_weight) R.vm.kill_object(model_encoder_layers_7_fc2_bias) gv2101: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1710: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2101, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1706, alloc1709, alloc1710) R.vm.kill_object(alloc1706) R.vm.kill_object(alloc1709) model_encoder_layers_8_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[132] model_encoder_layers_8_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[133] gv2102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1711: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2102, R.dtype("float16")) cls.layer_norm1(alloc1710, model_encoder_layers_8_self_attn_layer_norm_weight, model_encoder_layers_8_self_attn_layer_norm_bias, alloc1711) R.vm.kill_object(model_encoder_layers_8_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_8_self_attn_layer_norm_bias) model_encoder_layers_8_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[128] model_encoder_layers_8_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[129] gv2103: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1712: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2103, R.dtype("float16")) _1710: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_8_self_attn_q_proj_weight, 
alloc1711, model_encoder_layers_8_self_attn_q_proj_bias, alloc1712) R.vm.kill_object(model_encoder_layers_8_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_8_self_attn_q_proj_bias) gv2104: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape64: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1712, gv2104, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1712) model_encoder_layers_8_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[125] gv2105: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1713: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2105, R.dtype("float16")) _1711: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_8_self_attn_k_proj_weight, alloc1711, alloc1713) R.vm.kill_object(model_encoder_layers_8_self_attn_k_proj_weight) gv2106: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape65: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1713, gv2106, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1713) model_encoder_layers_8_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[126] model_encoder_layers_8_self_attn_v_proj_bias: 
R.Tensor((1280,), dtype="float16") = packed_params[127] gv2107: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1714: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2107, R.dtype("float16")) _1712: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_8_self_attn_v_proj_weight, alloc1711, model_encoder_layers_8_self_attn_v_proj_bias, alloc1714) R.vm.kill_object(alloc1711) R.vm.kill_object(model_encoder_layers_8_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_8_self_attn_v_proj_bias) gv2108: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape66: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1714, gv2108, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1714) gv2109: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape67: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape64, gv2109, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape64) gv2110: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape68: R.Tensor((batch_size * 1500, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape65, gv2110, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape65) gv2111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape69: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape66, gv2111, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape66) gv2112: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1715: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2112, R.dtype("float16")) _1713: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape67, reshape68, reshape69, alloc1715) R.vm.kill_object(reshape67) R.vm.kill_object(reshape68) R.vm.kill_object(reshape69) gv2113: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape70: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1715, gv2113, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1715) gv2114: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape71: R.Tensor((batch_size, 
1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape70, gv2114, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape70) model_encoder_layers_8_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[130] model_encoder_layers_8_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[131] gv2115: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1716: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2115, R.dtype("float16")) _1714: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_8_self_attn_out_proj_weight, reshape71, model_encoder_layers_8_self_attn_out_proj_bias, alloc1716) R.vm.kill_object(reshape71) R.vm.kill_object(model_encoder_layers_8_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_8_self_attn_out_proj_bias) gv2116: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1717: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2116, R.dtype("float16")) cls.add4(alloc1710, alloc1716, alloc1717) R.vm.kill_object(alloc1710) R.vm.kill_object(alloc1716) model_encoder_layers_8_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[138] model_encoder_layers_8_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[139] gv2117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1718: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2117, R.dtype("float16")) cls.layer_norm1(alloc1717, model_encoder_layers_8_final_layer_norm_weight, model_encoder_layers_8_final_layer_norm_bias, alloc1718) R.vm.kill_object(model_encoder_layers_8_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_8_final_layer_norm_bias) model_encoder_layers_8_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[134] model_encoder_layers_8_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[135] gv2118: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1719: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2118, R.dtype("float16")) _1717: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_8_fc1_weight, alloc1718, model_encoder_layers_8_fc1_bias, alloc1719) R.vm.kill_object(alloc1718) R.vm.kill_object(model_encoder_layers_8_fc1_weight) R.vm.kill_object(model_encoder_layers_8_fc1_bias) model_encoder_layers_8_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[136] model_encoder_layers_8_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[137] gv2119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1720: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2119, R.dtype("float16")) _1718: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_8_fc2_weight, alloc1719, model_encoder_layers_8_fc2_bias, alloc1720) 
R.vm.kill_object(alloc1719) R.vm.kill_object(model_encoder_layers_8_fc2_weight) R.vm.kill_object(model_encoder_layers_8_fc2_bias) gv2120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1721: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2120, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1717, alloc1720, alloc1721) R.vm.kill_object(alloc1717) R.vm.kill_object(alloc1720) model_encoder_layers_9_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[147] model_encoder_layers_9_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[148] gv2121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1722: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2121, R.dtype("float16")) cls.layer_norm1(alloc1721, model_encoder_layers_9_self_attn_layer_norm_weight, model_encoder_layers_9_self_attn_layer_norm_bias, alloc1722) R.vm.kill_object(model_encoder_layers_9_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_9_self_attn_layer_norm_bias) model_encoder_layers_9_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[143] model_encoder_layers_9_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[144] gv2122: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1723: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2122, 
R.dtype("float16")) _1721: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_9_self_attn_q_proj_weight, alloc1722, model_encoder_layers_9_self_attn_q_proj_bias, alloc1723) R.vm.kill_object(model_encoder_layers_9_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_9_self_attn_q_proj_bias) gv2123: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape72: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1723, gv2123, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1723) model_encoder_layers_9_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[140] gv2124: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1724: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2124, R.dtype("float16")) _1722: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_9_self_attn_k_proj_weight, alloc1722, alloc1724) R.vm.kill_object(model_encoder_layers_9_self_attn_k_proj_weight) gv2125: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape73: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1724, gv2125, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc1724) model_encoder_layers_9_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[141] model_encoder_layers_9_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[142] gv2126: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1725: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2126, R.dtype("float16")) _1723: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_9_self_attn_v_proj_weight, alloc1722, model_encoder_layers_9_self_attn_v_proj_bias, alloc1725) R.vm.kill_object(alloc1722) R.vm.kill_object(model_encoder_layers_9_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_9_self_attn_v_proj_bias) gv2127: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape74: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1725, gv2127, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1725) gv2128: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape75: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape72, gv2128, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape72) gv2129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape76: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape73, gv2129, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape73) gv2130: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape77: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape74, gv2130, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape74) gv2131: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1726: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2131, R.dtype("float16")) _1724: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape75, reshape76, reshape77, alloc1726) R.vm.kill_object(reshape75) R.vm.kill_object(reshape76) R.vm.kill_object(reshape77) gv2132: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape78: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1726, gv2132, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1726) gv2133: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape79: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape78, gv2133, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape78) model_encoder_layers_9_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[145] model_encoder_layers_9_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[146] gv2134: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1727: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2134, R.dtype("float16")) _1725: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_9_self_attn_out_proj_weight, reshape79, model_encoder_layers_9_self_attn_out_proj_bias, alloc1727) R.vm.kill_object(reshape79) R.vm.kill_object(model_encoder_layers_9_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_9_self_attn_out_proj_bias) gv2135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1728: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2135, R.dtype("float16")) cls.add4(alloc1721, alloc1727, alloc1728) R.vm.kill_object(alloc1721) R.vm.kill_object(alloc1727) model_encoder_layers_9_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[153] model_encoder_layers_9_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[154] gv2136: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1729: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2136, R.dtype("float16")) cls.layer_norm1(alloc1728, model_encoder_layers_9_final_layer_norm_weight, model_encoder_layers_9_final_layer_norm_bias, alloc1729) R.vm.kill_object(model_encoder_layers_9_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_9_final_layer_norm_bias) model_encoder_layers_9_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[149] model_encoder_layers_9_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[150] gv2137: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1730: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2137, R.dtype("float16")) _1728: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_9_fc1_weight, alloc1729, model_encoder_layers_9_fc1_bias, alloc1730) R.vm.kill_object(alloc1729) R.vm.kill_object(model_encoder_layers_9_fc1_weight) R.vm.kill_object(model_encoder_layers_9_fc1_bias) model_encoder_layers_9_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[151] model_encoder_layers_9_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[152] gv2138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1731: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2138, R.dtype("float16")) _1729: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_9_fc2_weight, alloc1730, model_encoder_layers_9_fc2_bias, alloc1731) R.vm.kill_object(alloc1730) R.vm.kill_object(model_encoder_layers_9_fc2_weight) R.vm.kill_object(model_encoder_layers_9_fc2_bias) gv2139: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1732: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2139, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1728, alloc1731, alloc1732) R.vm.kill_object(alloc1728) R.vm.kill_object(alloc1731) model_encoder_layers_10_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[162] model_encoder_layers_10_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[163] gv2140: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1733: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2140, R.dtype("float16")) cls.layer_norm1(alloc1732, model_encoder_layers_10_self_attn_layer_norm_weight, model_encoder_layers_10_self_attn_layer_norm_bias, alloc1733) R.vm.kill_object(model_encoder_layers_10_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_10_self_attn_layer_norm_bias) model_encoder_layers_10_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[158] model_encoder_layers_10_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[159] gv2141: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1734: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2141, R.dtype("float16")) _1732: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_10_self_attn_q_proj_weight, alloc1733, model_encoder_layers_10_self_attn_q_proj_bias, alloc1734) R.vm.kill_object(model_encoder_layers_10_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_10_self_attn_q_proj_bias) gv2142: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape80: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1734, gv2142, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1734) model_encoder_layers_10_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[155] gv2143: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1735: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2143, R.dtype("float16")) _1733: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_10_self_attn_k_proj_weight, alloc1733, alloc1735) R.vm.kill_object(model_encoder_layers_10_self_attn_k_proj_weight) gv2144: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape81: 
R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1735, gv2144, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1735) model_encoder_layers_10_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[156] model_encoder_layers_10_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[157] gv2145: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1736: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2145, R.dtype("float16")) _1734: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_10_self_attn_v_proj_weight, alloc1733, model_encoder_layers_10_self_attn_v_proj_bias, alloc1736) R.vm.kill_object(alloc1733) R.vm.kill_object(model_encoder_layers_10_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_10_self_attn_v_proj_bias) gv2146: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape82: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1736, gv2146, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1736) gv2147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape83: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape80, gv2147, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape80) gv2148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape84: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape81, gv2148, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape81) gv2149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape85: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape82, gv2149, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape82) gv2150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1737: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2150, R.dtype("float16")) _1735: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape83, reshape84, reshape85, alloc1737) R.vm.kill_object(reshape83) R.vm.kill_object(reshape84) R.vm.kill_object(reshape85) gv2151: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape86: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1737, gv2151, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1737) gv2152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape87: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape86, gv2152, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape86) model_encoder_layers_10_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[160] model_encoder_layers_10_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[161] gv2153: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1738: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2153, R.dtype("float16")) _1736: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_10_self_attn_out_proj_weight, reshape87, model_encoder_layers_10_self_attn_out_proj_bias, alloc1738) R.vm.kill_object(reshape87) R.vm.kill_object(model_encoder_layers_10_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_10_self_attn_out_proj_bias) gv2154: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1739: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2154, R.dtype("float16")) cls.add4(alloc1732, alloc1738, alloc1739) R.vm.kill_object(alloc1732) R.vm.kill_object(alloc1738) 
model_encoder_layers_10_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[168] model_encoder_layers_10_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[169] gv2155: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1740: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2155, R.dtype("float16")) cls.layer_norm1(alloc1739, model_encoder_layers_10_final_layer_norm_weight, model_encoder_layers_10_final_layer_norm_bias, alloc1740) R.vm.kill_object(model_encoder_layers_10_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_10_final_layer_norm_bias) model_encoder_layers_10_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[164] model_encoder_layers_10_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[165] gv2156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1741: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2156, R.dtype("float16")) _1739: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_10_fc1_weight, alloc1740, model_encoder_layers_10_fc1_bias, alloc1741) R.vm.kill_object(alloc1740) R.vm.kill_object(model_encoder_layers_10_fc1_weight) R.vm.kill_object(model_encoder_layers_10_fc1_bias) model_encoder_layers_10_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[166] model_encoder_layers_10_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[167] gv2157: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1742: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2157, R.dtype("float16")) _1740: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_10_fc2_weight, alloc1741, model_encoder_layers_10_fc2_bias, alloc1742) R.vm.kill_object(alloc1741) R.vm.kill_object(model_encoder_layers_10_fc2_weight) R.vm.kill_object(model_encoder_layers_10_fc2_bias) gv2158: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1743: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2158, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1739, alloc1742, alloc1743) R.vm.kill_object(alloc1739) R.vm.kill_object(alloc1742) model_encoder_layers_11_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[177] model_encoder_layers_11_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[178] gv2159: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1744: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2159, R.dtype("float16")) cls.layer_norm1(alloc1743, model_encoder_layers_11_self_attn_layer_norm_weight, model_encoder_layers_11_self_attn_layer_norm_bias, alloc1744) R.vm.kill_object(model_encoder_layers_11_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_11_self_attn_layer_norm_bias) model_encoder_layers_11_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[173] 
model_encoder_layers_11_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[174] gv2160: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1745: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2160, R.dtype("float16")) _1743: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_11_self_attn_q_proj_weight, alloc1744, model_encoder_layers_11_self_attn_q_proj_bias, alloc1745) R.vm.kill_object(model_encoder_layers_11_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_11_self_attn_q_proj_bias) gv2161: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape88: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1745, gv2161, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1745) model_encoder_layers_11_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[170] gv2162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1746: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2162, R.dtype("float16")) _1744: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_11_self_attn_k_proj_weight, alloc1744, alloc1746) R.vm.kill_object(model_encoder_layers_11_self_attn_k_proj_weight) gv2163: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape89: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1746, gv2163, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1746) model_encoder_layers_11_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[171] model_encoder_layers_11_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[172] gv2164: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1747: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2164, R.dtype("float16")) _1745: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_11_self_attn_v_proj_weight, alloc1744, model_encoder_layers_11_self_attn_v_proj_bias, alloc1747) R.vm.kill_object(alloc1744) R.vm.kill_object(model_encoder_layers_11_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_11_self_attn_v_proj_bias) gv2165: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape90: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1747, gv2165, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1747) gv2166: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape91: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape88, gv2166, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape88) gv2167: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape92: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape89, gv2167, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape89) gv2168: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape93: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape90, gv2168, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape90) gv2169: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1748: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2169, R.dtype("float16")) _1746: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape91, reshape92, reshape93, alloc1748) R.vm.kill_object(reshape91) R.vm.kill_object(reshape92) R.vm.kill_object(reshape93) gv2170: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape94: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1748, gv2170, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1748) gv2171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape95: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape94, gv2171, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape94) model_encoder_layers_11_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[175] model_encoder_layers_11_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[176] gv2172: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1749: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2172, R.dtype("float16")) _1747: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_11_self_attn_out_proj_weight, reshape95, model_encoder_layers_11_self_attn_out_proj_bias, alloc1749) R.vm.kill_object(reshape95) R.vm.kill_object(model_encoder_layers_11_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_11_self_attn_out_proj_bias) gv2173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1750: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2173, R.dtype("float16")) cls.add4(alloc1743, alloc1749, alloc1750) R.vm.kill_object(alloc1743) R.vm.kill_object(alloc1749) model_encoder_layers_11_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[183] model_encoder_layers_11_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[184] gv2174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1751: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2174, R.dtype("float16")) cls.layer_norm1(alloc1750, model_encoder_layers_11_final_layer_norm_weight, model_encoder_layers_11_final_layer_norm_bias, alloc1751) R.vm.kill_object(model_encoder_layers_11_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_11_final_layer_norm_bias) model_encoder_layers_11_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[179] model_encoder_layers_11_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[180] gv2175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1752: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2175, R.dtype("float16")) _1750: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_11_fc1_weight, alloc1751, model_encoder_layers_11_fc1_bias, alloc1752) R.vm.kill_object(alloc1751) R.vm.kill_object(model_encoder_layers_11_fc1_weight) R.vm.kill_object(model_encoder_layers_11_fc1_bias) model_encoder_layers_11_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = 
packed_params[181] model_encoder_layers_11_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[182] gv2176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1753: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2176, R.dtype("float16")) _1751: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_11_fc2_weight, alloc1752, model_encoder_layers_11_fc2_bias, alloc1753) R.vm.kill_object(alloc1752) R.vm.kill_object(model_encoder_layers_11_fc2_weight) R.vm.kill_object(model_encoder_layers_11_fc2_bias) gv2177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1754: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2177, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1750, alloc1753, alloc1754) R.vm.kill_object(alloc1750) R.vm.kill_object(alloc1753) model_encoder_layers_12_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[192] model_encoder_layers_12_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[193] gv2178: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1755: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2178, R.dtype("float16")) cls.layer_norm1(alloc1754, model_encoder_layers_12_self_attn_layer_norm_weight, model_encoder_layers_12_self_attn_layer_norm_bias, alloc1755) 
R.vm.kill_object(model_encoder_layers_12_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_12_self_attn_layer_norm_bias) model_encoder_layers_12_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[188] model_encoder_layers_12_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[189] gv2179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1756: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2179, R.dtype("float16")) _1754: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_12_self_attn_q_proj_weight, alloc1755, model_encoder_layers_12_self_attn_q_proj_bias, alloc1756) R.vm.kill_object(model_encoder_layers_12_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_12_self_attn_q_proj_bias) gv2180: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape96: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1756, gv2180, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1756) model_encoder_layers_12_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[185] gv2181: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1757: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2181, R.dtype("float16")) _1755: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_12_self_attn_k_proj_weight, alloc1755, alloc1757) R.vm.kill_object(model_encoder_layers_12_self_attn_k_proj_weight) gv2182: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape97: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1757, gv2182, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1757) model_encoder_layers_12_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[186] model_encoder_layers_12_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[187] gv2183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1758: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2183, R.dtype("float16")) _1756: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_12_self_attn_v_proj_weight, alloc1755, model_encoder_layers_12_self_attn_v_proj_bias, alloc1758) R.vm.kill_object(alloc1755) R.vm.kill_object(model_encoder_layers_12_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_12_self_attn_v_proj_bias) gv2184: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape98: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1758, gv2184, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1758) gv2185: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape99: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape96, gv2185, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape96) gv2186: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape100: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape97, gv2186, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape97) gv2187: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape101: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape98, gv2187, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape98) gv2188: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1759: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2188, R.dtype("float16")) _1757: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape99, reshape100, reshape101, 
alloc1759) R.vm.kill_object(reshape99) R.vm.kill_object(reshape100) R.vm.kill_object(reshape101) gv2189: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape102: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1759, gv2189, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1759) gv2190: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape103: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape102, gv2190, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape102) model_encoder_layers_12_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[190] model_encoder_layers_12_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[191] gv2191: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1760: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2191, R.dtype("float16")) _1758: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_12_self_attn_out_proj_weight, reshape103, model_encoder_layers_12_self_attn_out_proj_bias, alloc1760) R.vm.kill_object(reshape103) R.vm.kill_object(model_encoder_layers_12_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_12_self_attn_out_proj_bias) gv2192: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1761: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2192, R.dtype("float16")) cls.add4(alloc1754, alloc1760, alloc1761) R.vm.kill_object(alloc1754) R.vm.kill_object(alloc1760) model_encoder_layers_12_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[198] model_encoder_layers_12_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[199] gv2193: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1762: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2193, R.dtype("float16")) cls.layer_norm1(alloc1761, model_encoder_layers_12_final_layer_norm_weight, model_encoder_layers_12_final_layer_norm_bias, alloc1762) R.vm.kill_object(model_encoder_layers_12_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_12_final_layer_norm_bias) model_encoder_layers_12_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[194] model_encoder_layers_12_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[195] gv2194: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1763: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2194, R.dtype("float16")) _1761: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_12_fc1_weight, alloc1762, model_encoder_layers_12_fc1_bias, 
alloc1763) R.vm.kill_object(alloc1762) R.vm.kill_object(model_encoder_layers_12_fc1_weight) R.vm.kill_object(model_encoder_layers_12_fc1_bias) model_encoder_layers_12_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[196] model_encoder_layers_12_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[197] gv2195: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1764: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2195, R.dtype("float16")) _1762: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_12_fc2_weight, alloc1763, model_encoder_layers_12_fc2_bias, alloc1764) R.vm.kill_object(alloc1763) R.vm.kill_object(model_encoder_layers_12_fc2_weight) R.vm.kill_object(model_encoder_layers_12_fc2_bias) gv2196: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1765: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2196, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1761, alloc1764, alloc1765) R.vm.kill_object(alloc1761) R.vm.kill_object(alloc1764) model_encoder_layers_13_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[207] model_encoder_layers_13_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[208] gv2197: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1766: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, 
R.prim_value(0), gv2197, R.dtype("float16")) cls.layer_norm1(alloc1765, model_encoder_layers_13_self_attn_layer_norm_weight, model_encoder_layers_13_self_attn_layer_norm_bias, alloc1766) R.vm.kill_object(model_encoder_layers_13_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_13_self_attn_layer_norm_bias) model_encoder_layers_13_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[203] model_encoder_layers_13_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[204] gv2198: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1767: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2198, R.dtype("float16")) _1765: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_13_self_attn_q_proj_weight, alloc1766, model_encoder_layers_13_self_attn_q_proj_bias, alloc1767) R.vm.kill_object(model_encoder_layers_13_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_13_self_attn_q_proj_bias) gv2199: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape104: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1767, gv2199, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1767) model_encoder_layers_13_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[200] gv2200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1768: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2200, R.dtype("float16")) _1766: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_13_self_attn_k_proj_weight, alloc1766, alloc1768) R.vm.kill_object(model_encoder_layers_13_self_attn_k_proj_weight) gv2201: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape105: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1768, gv2201, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1768) model_encoder_layers_13_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[201] model_encoder_layers_13_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[202] gv2202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1769: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2202, R.dtype("float16")) _1767: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_13_self_attn_v_proj_weight, alloc1766, model_encoder_layers_13_self_attn_v_proj_bias, alloc1769) R.vm.kill_object(alloc1766) R.vm.kill_object(model_encoder_layers_13_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_13_self_attn_v_proj_bias) gv2203: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape106: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1769, gv2203, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1769) gv2204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape107: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape104, gv2204, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape104) gv2205: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape108: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape105, gv2205, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape105) gv2206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape109: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape106, gv2206, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape106) gv2207: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1770: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, 
R.prim_value(0), gv2207, R.dtype("float16")) _1768: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape107, reshape108, reshape109, alloc1770) R.vm.kill_object(reshape107) R.vm.kill_object(reshape108) R.vm.kill_object(reshape109) gv2208: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape110: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1770, gv2208, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1770) gv2209: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape111: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape110, gv2209, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape110) model_encoder_layers_13_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[205] model_encoder_layers_13_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[206] gv2210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1771: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2210, R.dtype("float16")) _1769: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_13_self_attn_out_proj_weight, reshape111, 
model_encoder_layers_13_self_attn_out_proj_bias, alloc1771) R.vm.kill_object(reshape111) R.vm.kill_object(model_encoder_layers_13_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_13_self_attn_out_proj_bias) gv2211: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1772: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2211, R.dtype("float16")) cls.add4(alloc1765, alloc1771, alloc1772) R.vm.kill_object(alloc1765) R.vm.kill_object(alloc1771) model_encoder_layers_13_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[213] model_encoder_layers_13_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[214] gv2212: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1773: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2212, R.dtype("float16")) cls.layer_norm1(alloc1772, model_encoder_layers_13_final_layer_norm_weight, model_encoder_layers_13_final_layer_norm_bias, alloc1773) R.vm.kill_object(model_encoder_layers_13_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_13_final_layer_norm_bias) model_encoder_layers_13_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[209] model_encoder_layers_13_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[210] gv2213: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1774: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, 
R.prim_value(0), gv2213, R.dtype("float16")) _1772: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_13_fc1_weight, alloc1773, model_encoder_layers_13_fc1_bias, alloc1774) R.vm.kill_object(alloc1773) R.vm.kill_object(model_encoder_layers_13_fc1_weight) R.vm.kill_object(model_encoder_layers_13_fc1_bias) model_encoder_layers_13_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[211] model_encoder_layers_13_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[212] gv2214: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1775: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2214, R.dtype("float16")) _1773: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_13_fc2_weight, alloc1774, model_encoder_layers_13_fc2_bias, alloc1775) R.vm.kill_object(alloc1774) R.vm.kill_object(model_encoder_layers_13_fc2_weight) R.vm.kill_object(model_encoder_layers_13_fc2_bias) gv2215: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1776: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2215, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1772, alloc1775, alloc1776) R.vm.kill_object(alloc1772) R.vm.kill_object(alloc1775) model_encoder_layers_14_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[222] model_encoder_layers_14_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[223] gv2216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1777: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2216, R.dtype("float16")) cls.layer_norm1(alloc1776, model_encoder_layers_14_self_attn_layer_norm_weight, model_encoder_layers_14_self_attn_layer_norm_bias, alloc1777) R.vm.kill_object(model_encoder_layers_14_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_14_self_attn_layer_norm_bias) model_encoder_layers_14_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[218] model_encoder_layers_14_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[219] gv2217: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1778: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2217, R.dtype("float16")) _1776: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_14_self_attn_q_proj_weight, alloc1777, model_encoder_layers_14_self_attn_q_proj_bias, alloc1778) R.vm.kill_object(model_encoder_layers_14_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_14_self_attn_q_proj_bias) gv2218: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape112: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1778, gv2218, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1778) model_encoder_layers_14_self_attn_k_proj_weight: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[215] gv2219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1779: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2219, R.dtype("float16")) _1777: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_14_self_attn_k_proj_weight, alloc1777, alloc1779) R.vm.kill_object(model_encoder_layers_14_self_attn_k_proj_weight) gv2220: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape113: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1779, gv2220, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1779) model_encoder_layers_14_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[216] model_encoder_layers_14_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[217] gv2221: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1780: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2221, R.dtype("float16")) _1778: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_14_self_attn_v_proj_weight, alloc1777, model_encoder_layers_14_self_attn_v_proj_bias, alloc1780) R.vm.kill_object(alloc1777) R.vm.kill_object(model_encoder_layers_14_self_attn_v_proj_weight) 
R.vm.kill_object(model_encoder_layers_14_self_attn_v_proj_bias) gv2222: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape114: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1780, gv2222, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1780) gv2223: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape115: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape112, gv2223, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape112) gv2224: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape116: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape113, gv2224, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape113) gv2225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape117: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape114, gv2225, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape114) gv2226: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1781: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2226, R.dtype("float16")) _1779: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape115, reshape116, reshape117, alloc1781) R.vm.kill_object(reshape115) R.vm.kill_object(reshape116) R.vm.kill_object(reshape117) gv2227: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape118: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1781, gv2227, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1781) gv2228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape119: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape118, gv2228, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape118) model_encoder_layers_14_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[220] model_encoder_layers_14_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[221] gv2229: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1782: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage25, R.prim_value(0), gv2229, R.dtype("float16")) _1780: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_14_self_attn_out_proj_weight, reshape119, model_encoder_layers_14_self_attn_out_proj_bias, alloc1782) R.vm.kill_object(reshape119) R.vm.kill_object(model_encoder_layers_14_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_14_self_attn_out_proj_bias) gv2230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1783: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2230, R.dtype("float16")) cls.add4(alloc1776, alloc1782, alloc1783) R.vm.kill_object(alloc1776) R.vm.kill_object(alloc1782) model_encoder_layers_14_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[228] model_encoder_layers_14_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[229] gv2231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1784: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2231, R.dtype("float16")) cls.layer_norm1(alloc1783, model_encoder_layers_14_final_layer_norm_weight, model_encoder_layers_14_final_layer_norm_bias, alloc1784) R.vm.kill_object(model_encoder_layers_14_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_14_final_layer_norm_bias) model_encoder_layers_14_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[224] model_encoder_layers_14_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[225] gv2232: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1785: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2232, R.dtype("float16")) _1783: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_14_fc1_weight, alloc1784, model_encoder_layers_14_fc1_bias, alloc1785) R.vm.kill_object(alloc1784) R.vm.kill_object(model_encoder_layers_14_fc1_weight) R.vm.kill_object(model_encoder_layers_14_fc1_bias) model_encoder_layers_14_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[226] model_encoder_layers_14_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[227] gv2233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1786: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2233, R.dtype("float16")) _1784: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_14_fc2_weight, alloc1785, model_encoder_layers_14_fc2_bias, alloc1786) R.vm.kill_object(alloc1785) R.vm.kill_object(model_encoder_layers_14_fc2_weight) R.vm.kill_object(model_encoder_layers_14_fc2_bias) gv2234: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1787: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2234, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1783, alloc1786, alloc1787) R.vm.kill_object(alloc1783) R.vm.kill_object(alloc1786) model_encoder_layers_15_self_attn_layer_norm_weight: 
R.Tensor((1280,), dtype="float16") = packed_params[237] model_encoder_layers_15_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[238] gv2235: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1788: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2235, R.dtype("float16")) cls.layer_norm1(alloc1787, model_encoder_layers_15_self_attn_layer_norm_weight, model_encoder_layers_15_self_attn_layer_norm_bias, alloc1788) R.vm.kill_object(model_encoder_layers_15_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_15_self_attn_layer_norm_bias) model_encoder_layers_15_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[233] model_encoder_layers_15_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[234] gv2236: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1789: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2236, R.dtype("float16")) _1787: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_15_self_attn_q_proj_weight, alloc1788, model_encoder_layers_15_self_attn_q_proj_bias, alloc1789) R.vm.kill_object(model_encoder_layers_15_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_15_self_attn_q_proj_bias) gv2237: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape120: 
R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1789, gv2237, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1789) model_encoder_layers_15_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[230] gv2238: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1790: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2238, R.dtype("float16")) _1788: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_15_self_attn_k_proj_weight, alloc1788, alloc1790) R.vm.kill_object(model_encoder_layers_15_self_attn_k_proj_weight) gv2239: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape121: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1790, gv2239, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1790) model_encoder_layers_15_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[231] model_encoder_layers_15_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[232] gv2240: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1791: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2240, R.dtype("float16")) _1789: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_15_self_attn_v_proj_weight, alloc1788, model_encoder_layers_15_self_attn_v_proj_bias, alloc1791) R.vm.kill_object(alloc1788) R.vm.kill_object(model_encoder_layers_15_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_15_self_attn_v_proj_bias) gv2241: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape122: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1791, gv2241, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1791) gv2242: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape123: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape120, gv2242, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape120) gv2243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape124: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape121, gv2243, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape121) gv2244: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape125: 
R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape122, gv2244, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape122) gv2245: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1792: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2245, R.dtype("float16")) _1790: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape123, reshape124, reshape125, alloc1792) R.vm.kill_object(reshape123) R.vm.kill_object(reshape124) R.vm.kill_object(reshape125) gv2246: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape126: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1792, gv2246, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1792) gv2247: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape127: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape126, gv2247, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape126) model_encoder_layers_15_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[235] model_encoder_layers_15_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[236] 
gv2248: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1793: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2248, R.dtype("float16")) _1791: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_15_self_attn_out_proj_weight, reshape127, model_encoder_layers_15_self_attn_out_proj_bias, alloc1793) R.vm.kill_object(reshape127) R.vm.kill_object(model_encoder_layers_15_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_15_self_attn_out_proj_bias) gv2249: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1794: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2249, R.dtype("float16")) cls.add4(alloc1787, alloc1793, alloc1794) R.vm.kill_object(alloc1787) R.vm.kill_object(alloc1793) model_encoder_layers_15_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[243] model_encoder_layers_15_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[244] gv2250: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1795: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2250, R.dtype("float16")) cls.layer_norm1(alloc1794, model_encoder_layers_15_final_layer_norm_weight, model_encoder_layers_15_final_layer_norm_bias, alloc1795) R.vm.kill_object(model_encoder_layers_15_final_layer_norm_weight) 
R.vm.kill_object(model_encoder_layers_15_final_layer_norm_bias) model_encoder_layers_15_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[239] model_encoder_layers_15_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[240] gv2251: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1796: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2251, R.dtype("float16")) _1794: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_15_fc1_weight, alloc1795, model_encoder_layers_15_fc1_bias, alloc1796) R.vm.kill_object(alloc1795) R.vm.kill_object(model_encoder_layers_15_fc1_weight) R.vm.kill_object(model_encoder_layers_15_fc1_bias) model_encoder_layers_15_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[241] model_encoder_layers_15_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[242] gv2252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1797: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2252, R.dtype("float16")) _1795: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_15_fc2_weight, alloc1796, model_encoder_layers_15_fc2_bias, alloc1797) R.vm.kill_object(alloc1796) R.vm.kill_object(model_encoder_layers_15_fc2_weight) R.vm.kill_object(model_encoder_layers_15_fc2_bias) gv2253: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1798: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2253, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1794, alloc1797, alloc1798) R.vm.kill_object(alloc1794) R.vm.kill_object(alloc1797) model_encoder_layers_16_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[252] model_encoder_layers_16_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[253] gv2254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1799: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2254, R.dtype("float16")) cls.layer_norm1(alloc1798, model_encoder_layers_16_self_attn_layer_norm_weight, model_encoder_layers_16_self_attn_layer_norm_bias, alloc1799) R.vm.kill_object(model_encoder_layers_16_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_16_self_attn_layer_norm_bias) model_encoder_layers_16_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[248] model_encoder_layers_16_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[249] gv2255: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1800: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2255, R.dtype("float16")) _1798: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_16_self_attn_q_proj_weight, alloc1799, model_encoder_layers_16_self_attn_q_proj_bias, alloc1800) R.vm.kill_object(model_encoder_layers_16_self_attn_q_proj_weight) 
R.vm.kill_object(model_encoder_layers_16_self_attn_q_proj_bias) gv2256: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape128: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1800, gv2256, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1800) model_encoder_layers_16_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[245] gv2257: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1801: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2257, R.dtype("float16")) _1799: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_16_self_attn_k_proj_weight, alloc1799, alloc1801) R.vm.kill_object(model_encoder_layers_16_self_attn_k_proj_weight) gv2258: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape129: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1801, gv2258, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1801) model_encoder_layers_16_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[246] model_encoder_layers_16_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[247] gv2259: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1802: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2259, R.dtype("float16")) _1800: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_16_self_attn_v_proj_weight, alloc1799, model_encoder_layers_16_self_attn_v_proj_bias, alloc1802) R.vm.kill_object(alloc1799) R.vm.kill_object(model_encoder_layers_16_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_16_self_attn_v_proj_bias) gv2260: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape130: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1802, gv2260, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1802) gv2261: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape131: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape128, gv2261, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape128) gv2262: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape132: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape129, gv2262, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), 
dtype="float16"),)) R.vm.kill_object(reshape129) gv2263: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape133: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape130, gv2263, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape130) gv2264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1803: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2264, R.dtype("float16")) _1801: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape131, reshape132, reshape133, alloc1803) R.vm.kill_object(reshape131) R.vm.kill_object(reshape132) R.vm.kill_object(reshape133) gv2265: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape134: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1803, gv2265, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1803) gv2266: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape135: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape134, gv2266, 
sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape134) model_encoder_layers_16_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[250] model_encoder_layers_16_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[251] gv2267: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1804: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2267, R.dtype("float16")) _1802: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_16_self_attn_out_proj_weight, reshape135, model_encoder_layers_16_self_attn_out_proj_bias, alloc1804) R.vm.kill_object(reshape135) R.vm.kill_object(model_encoder_layers_16_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_16_self_attn_out_proj_bias) gv2268: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1805: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2268, R.dtype("float16")) cls.add4(alloc1798, alloc1804, alloc1805) R.vm.kill_object(alloc1798) R.vm.kill_object(alloc1804) model_encoder_layers_16_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[258] model_encoder_layers_16_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[259] gv2269: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1806: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2269, R.dtype("float16")) cls.layer_norm1(alloc1805, model_encoder_layers_16_final_layer_norm_weight, model_encoder_layers_16_final_layer_norm_bias, alloc1806) R.vm.kill_object(model_encoder_layers_16_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_16_final_layer_norm_bias) model_encoder_layers_16_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[254] model_encoder_layers_16_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[255] gv2270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1807: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2270, R.dtype("float16")) _1805: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_16_fc1_weight, alloc1806, model_encoder_layers_16_fc1_bias, alloc1807) R.vm.kill_object(alloc1806) R.vm.kill_object(model_encoder_layers_16_fc1_weight) R.vm.kill_object(model_encoder_layers_16_fc1_bias) model_encoder_layers_16_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[256] model_encoder_layers_16_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[257] gv2271: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1808: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2271, R.dtype("float16")) _1806: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_16_fc2_weight, alloc1807, model_encoder_layers_16_fc2_bias, alloc1808) R.vm.kill_object(alloc1807) 
R.vm.kill_object(model_encoder_layers_16_fc2_weight) R.vm.kill_object(model_encoder_layers_16_fc2_bias) gv2272: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1809: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2272, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1805, alloc1808, alloc1809) R.vm.kill_object(alloc1805) R.vm.kill_object(alloc1808) model_encoder_layers_17_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[267] model_encoder_layers_17_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[268] gv2273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1810: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2273, R.dtype("float16")) cls.layer_norm1(alloc1809, model_encoder_layers_17_self_attn_layer_norm_weight, model_encoder_layers_17_self_attn_layer_norm_bias, alloc1810) R.vm.kill_object(model_encoder_layers_17_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_17_self_attn_layer_norm_bias) model_encoder_layers_17_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[263] model_encoder_layers_17_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[264] gv2274: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1811: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2274, R.dtype("float16")) 
_1809: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_17_self_attn_q_proj_weight, alloc1810, model_encoder_layers_17_self_attn_q_proj_bias, alloc1811) R.vm.kill_object(model_encoder_layers_17_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_17_self_attn_q_proj_bias) gv2275: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape136: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1811, gv2275, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1811) model_encoder_layers_17_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[260] gv2276: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1812: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2276, R.dtype("float16")) _1810: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_17_self_attn_k_proj_weight, alloc1810, alloc1812) R.vm.kill_object(model_encoder_layers_17_self_attn_k_proj_weight) gv2277: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape137: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1812, gv2277, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1812) 
model_encoder_layers_17_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[261] model_encoder_layers_17_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[262] gv2278: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1813: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2278, R.dtype("float16")) _1811: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_17_self_attn_v_proj_weight, alloc1810, model_encoder_layers_17_self_attn_v_proj_bias, alloc1813) R.vm.kill_object(alloc1810) R.vm.kill_object(model_encoder_layers_17_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_17_self_attn_v_proj_bias) gv2279: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape138: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1813, gv2279, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1813) gv2280: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape139: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape136, gv2280, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape136) gv2281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape140: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape137, gv2281, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape137) gv2282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape141: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape138, gv2282, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape138) gv2283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1814: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2283, R.dtype("float16")) _1812: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape139, reshape140, reshape141, alloc1814) R.vm.kill_object(reshape139) R.vm.kill_object(reshape140) R.vm.kill_object(reshape141) gv2284: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape142: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1814, gv2284, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1814) gv2285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape143: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape142, gv2285, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape142) model_encoder_layers_17_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[265] model_encoder_layers_17_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[266] gv2286: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1815: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2286, R.dtype("float16")) _1813: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_17_self_attn_out_proj_weight, reshape143, model_encoder_layers_17_self_attn_out_proj_bias, alloc1815) R.vm.kill_object(reshape143) R.vm.kill_object(model_encoder_layers_17_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_17_self_attn_out_proj_bias) gv2287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1816: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2287, R.dtype("float16")) cls.add4(alloc1809, alloc1815, alloc1816) R.vm.kill_object(alloc1809) R.vm.kill_object(alloc1815) model_encoder_layers_17_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[273] model_encoder_layers_17_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[274] gv2288: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1817: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2288, R.dtype("float16")) cls.layer_norm1(alloc1816, model_encoder_layers_17_final_layer_norm_weight, model_encoder_layers_17_final_layer_norm_bias, alloc1817) R.vm.kill_object(model_encoder_layers_17_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_17_final_layer_norm_bias) model_encoder_layers_17_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[269] model_encoder_layers_17_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[270] gv2289: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1818: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2289, R.dtype("float16")) _1816: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_17_fc1_weight, alloc1817, model_encoder_layers_17_fc1_bias, alloc1818) R.vm.kill_object(alloc1817) R.vm.kill_object(model_encoder_layers_17_fc1_weight) R.vm.kill_object(model_encoder_layers_17_fc1_bias) model_encoder_layers_17_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[271] model_encoder_layers_17_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[272] gv2290: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1819: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2290, 
R.dtype("float16")) _1817: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_17_fc2_weight, alloc1818, model_encoder_layers_17_fc2_bias, alloc1819) R.vm.kill_object(alloc1818) R.vm.kill_object(model_encoder_layers_17_fc2_weight) R.vm.kill_object(model_encoder_layers_17_fc2_bias) gv2291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1820: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2291, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1816, alloc1819, alloc1820) R.vm.kill_object(alloc1816) R.vm.kill_object(alloc1819) model_encoder_layers_18_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[282] model_encoder_layers_18_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[283] gv2292: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1821: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2292, R.dtype("float16")) cls.layer_norm1(alloc1820, model_encoder_layers_18_self_attn_layer_norm_weight, model_encoder_layers_18_self_attn_layer_norm_bias, alloc1821) R.vm.kill_object(model_encoder_layers_18_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_18_self_attn_layer_norm_bias) model_encoder_layers_18_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[278] model_encoder_layers_18_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[279] gv2293: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1822: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2293, R.dtype("float16")) _1820: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_18_self_attn_q_proj_weight, alloc1821, model_encoder_layers_18_self_attn_q_proj_bias, alloc1822) R.vm.kill_object(model_encoder_layers_18_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_18_self_attn_q_proj_bias) gv2294: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape144: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1822, gv2294, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1822) model_encoder_layers_18_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[275] gv2295: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1823: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2295, R.dtype("float16")) _1821: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_18_self_attn_k_proj_weight, alloc1821, alloc1823) R.vm.kill_object(model_encoder_layers_18_self_attn_k_proj_weight) gv2296: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) 
reshape145: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1823, gv2296, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1823) model_encoder_layers_18_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[276] model_encoder_layers_18_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[277] gv2297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1824: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2297, R.dtype("float16")) _1822: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_18_self_attn_v_proj_weight, alloc1821, model_encoder_layers_18_self_attn_v_proj_bias, alloc1824) R.vm.kill_object(alloc1821) R.vm.kill_object(model_encoder_layers_18_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_18_self_attn_v_proj_bias) gv2298: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape146: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1824, gv2298, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1824) gv2299: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape147: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape144, gv2299, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape144) gv2300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape148: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape145, gv2300, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape145) gv2301: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape149: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape146, gv2301, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape146) gv2302: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1825: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2302, R.dtype("float16")) _1823: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape147, reshape148, reshape149, alloc1825) R.vm.kill_object(reshape147) R.vm.kill_object(reshape148) R.vm.kill_object(reshape149) gv2303: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape150: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1825, gv2303, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1825) gv2304: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape151: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape150, gv2304, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape150) model_encoder_layers_18_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[280] model_encoder_layers_18_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[281] gv2305: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1826: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2305, R.dtype("float16")) _1824: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_18_self_attn_out_proj_weight, reshape151, model_encoder_layers_18_self_attn_out_proj_bias, alloc1826) R.vm.kill_object(reshape151) R.vm.kill_object(model_encoder_layers_18_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_18_self_attn_out_proj_bias) gv2306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1827: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2306, R.dtype("float16")) cls.add4(alloc1820, alloc1826, alloc1827) R.vm.kill_object(alloc1820) 
R.vm.kill_object(alloc1826) model_encoder_layers_18_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[288] model_encoder_layers_18_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[289] gv2307: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1828: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2307, R.dtype("float16")) cls.layer_norm1(alloc1827, model_encoder_layers_18_final_layer_norm_weight, model_encoder_layers_18_final_layer_norm_bias, alloc1828) R.vm.kill_object(model_encoder_layers_18_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_18_final_layer_norm_bias) model_encoder_layers_18_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[284] model_encoder_layers_18_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[285] gv2308: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1829: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2308, R.dtype("float16")) _1827: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_18_fc1_weight, alloc1828, model_encoder_layers_18_fc1_bias, alloc1829) R.vm.kill_object(alloc1828) R.vm.kill_object(model_encoder_layers_18_fc1_weight) R.vm.kill_object(model_encoder_layers_18_fc1_bias) model_encoder_layers_18_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[286] model_encoder_layers_18_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[287] gv2309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1830: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2309, R.dtype("float16")) _1828: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_18_fc2_weight, alloc1829, model_encoder_layers_18_fc2_bias, alloc1830) R.vm.kill_object(alloc1829) R.vm.kill_object(model_encoder_layers_18_fc2_weight) R.vm.kill_object(model_encoder_layers_18_fc2_bias) gv2310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1831: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2310, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1827, alloc1830, alloc1831) R.vm.kill_object(alloc1827) R.vm.kill_object(alloc1830) model_encoder_layers_19_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[297] model_encoder_layers_19_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[298] gv2311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1832: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2311, R.dtype("float16")) cls.layer_norm1(alloc1831, model_encoder_layers_19_self_attn_layer_norm_weight, model_encoder_layers_19_self_attn_layer_norm_bias, alloc1832) R.vm.kill_object(model_encoder_layers_19_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_19_self_attn_layer_norm_bias) model_encoder_layers_19_self_attn_q_proj_weight: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[293] model_encoder_layers_19_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[294] gv2312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1833: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2312, R.dtype("float16")) _1831: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_19_self_attn_q_proj_weight, alloc1832, model_encoder_layers_19_self_attn_q_proj_bias, alloc1833) R.vm.kill_object(model_encoder_layers_19_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_19_self_attn_q_proj_bias) gv2313: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape152: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1833, gv2313, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1833) model_encoder_layers_19_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[290] gv2314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1834: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2314, R.dtype("float16")) _1832: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_19_self_attn_k_proj_weight, alloc1832, alloc1834) R.vm.kill_object(model_encoder_layers_19_self_attn_k_proj_weight) 
gv2315: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape153: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1834, gv2315, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1834) model_encoder_layers_19_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[291] model_encoder_layers_19_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[292] gv2316: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1835: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2316, R.dtype("float16")) _1833: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_19_self_attn_v_proj_weight, alloc1832, model_encoder_layers_19_self_attn_v_proj_bias, alloc1835) R.vm.kill_object(alloc1832) R.vm.kill_object(model_encoder_layers_19_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_19_self_attn_v_proj_bias) gv2317: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape154: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1835, gv2317, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1835) gv2318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape155: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape152, gv2318, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape152) gv2319: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape156: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape153, gv2319, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape153) gv2320: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape157: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape154, gv2320, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape154) gv2321: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1836: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2321, R.dtype("float16")) _1834: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape155, reshape156, reshape157, alloc1836) R.vm.kill_object(reshape155) R.vm.kill_object(reshape156) R.vm.kill_object(reshape157) gv2322: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape158: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1836, gv2322, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1836) gv2323: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape159: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape158, gv2323, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape158) model_encoder_layers_19_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[295] model_encoder_layers_19_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[296] gv2324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1837: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2324, R.dtype("float16")) _1835: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_19_self_attn_out_proj_weight, reshape159, model_encoder_layers_19_self_attn_out_proj_bias, alloc1837) R.vm.kill_object(reshape159) R.vm.kill_object(model_encoder_layers_19_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_19_self_attn_out_proj_bias) gv2325: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1838: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2325, R.dtype("float16")) cls.add4(alloc1831, alloc1837, alloc1838) R.vm.kill_object(alloc1831) R.vm.kill_object(alloc1837) model_encoder_layers_19_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[303] model_encoder_layers_19_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[304] gv2326: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1839: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2326, R.dtype("float16")) cls.layer_norm1(alloc1838, model_encoder_layers_19_final_layer_norm_weight, model_encoder_layers_19_final_layer_norm_bias, alloc1839) R.vm.kill_object(model_encoder_layers_19_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_19_final_layer_norm_bias) model_encoder_layers_19_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[299] model_encoder_layers_19_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[300] gv2327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1840: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2327, R.dtype("float16")) _1838: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_19_fc1_weight, alloc1839, model_encoder_layers_19_fc1_bias, alloc1840) R.vm.kill_object(alloc1839) R.vm.kill_object(model_encoder_layers_19_fc1_weight) R.vm.kill_object(model_encoder_layers_19_fc1_bias) model_encoder_layers_19_fc2_weight: R.Tensor((1280, 
5120), dtype="float16") = packed_params[301] model_encoder_layers_19_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[302] gv2328: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1841: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2328, R.dtype("float16")) _1839: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_19_fc2_weight, alloc1840, model_encoder_layers_19_fc2_bias, alloc1841) R.vm.kill_object(alloc1840) R.vm.kill_object(model_encoder_layers_19_fc2_weight) R.vm.kill_object(model_encoder_layers_19_fc2_bias) gv2329: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1842: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2329, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1838, alloc1841, alloc1842) R.vm.kill_object(alloc1838) R.vm.kill_object(alloc1841) model_encoder_layers_20_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[312] model_encoder_layers_20_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[313] gv2330: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1843: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2330, R.dtype("float16")) cls.layer_norm1(alloc1842, model_encoder_layers_20_self_attn_layer_norm_weight, model_encoder_layers_20_self_attn_layer_norm_bias, alloc1843) 
R.vm.kill_object(model_encoder_layers_20_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_20_self_attn_layer_norm_bias) model_encoder_layers_20_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[308] model_encoder_layers_20_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[309] gv2331: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1844: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2331, R.dtype("float16")) _1842: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_20_self_attn_q_proj_weight, alloc1843, model_encoder_layers_20_self_attn_q_proj_bias, alloc1844) R.vm.kill_object(model_encoder_layers_20_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_20_self_attn_q_proj_bias) gv2332: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape160: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1844, gv2332, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1844) model_encoder_layers_20_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[305] gv2333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1845: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2333, R.dtype("float16")) _1843: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_20_self_attn_k_proj_weight, alloc1843, alloc1845) R.vm.kill_object(model_encoder_layers_20_self_attn_k_proj_weight) gv2334: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape161: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1845, gv2334, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1845) model_encoder_layers_20_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[306] model_encoder_layers_20_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[307] gv2335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1846: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2335, R.dtype("float16")) _1844: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_20_self_attn_v_proj_weight, alloc1843, model_encoder_layers_20_self_attn_v_proj_bias, alloc1846) R.vm.kill_object(alloc1843) R.vm.kill_object(model_encoder_layers_20_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_20_self_attn_v_proj_bias) gv2336: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape162: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1846, gv2336, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1846) gv2337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape163: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape160, gv2337, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape160) gv2338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape164: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape161, gv2338, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape161) gv2339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape165: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape162, gv2339, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape162) gv2340: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1847: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2340, R.dtype("float16")) _1845: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape163, reshape164, 
reshape165, alloc1847) R.vm.kill_object(reshape163) R.vm.kill_object(reshape164) R.vm.kill_object(reshape165) gv2341: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape166: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1847, gv2341, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1847) gv2342: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape167: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape166, gv2342, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape166) model_encoder_layers_20_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[310] model_encoder_layers_20_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[311] gv2343: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1848: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2343, R.dtype("float16")) _1846: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_20_self_attn_out_proj_weight, reshape167, model_encoder_layers_20_self_attn_out_proj_bias, alloc1848) R.vm.kill_object(reshape167) R.vm.kill_object(model_encoder_layers_20_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_20_self_attn_out_proj_bias) 
gv2344: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1849: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2344, R.dtype("float16")) cls.add4(alloc1842, alloc1848, alloc1849) R.vm.kill_object(alloc1842) R.vm.kill_object(alloc1848) model_encoder_layers_20_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[318] model_encoder_layers_20_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[319] gv2345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1850: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2345, R.dtype("float16")) cls.layer_norm1(alloc1849, model_encoder_layers_20_final_layer_norm_weight, model_encoder_layers_20_final_layer_norm_bias, alloc1850) R.vm.kill_object(model_encoder_layers_20_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_20_final_layer_norm_bias) model_encoder_layers_20_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[314] model_encoder_layers_20_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[315] gv2346: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1851: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2346, R.dtype("float16")) _1849: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_20_fc1_weight, alloc1850, 
model_encoder_layers_20_fc1_bias, alloc1851) R.vm.kill_object(alloc1850) R.vm.kill_object(model_encoder_layers_20_fc1_weight) R.vm.kill_object(model_encoder_layers_20_fc1_bias) model_encoder_layers_20_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[316] model_encoder_layers_20_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[317] gv2347: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1852: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2347, R.dtype("float16")) _1850: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_20_fc2_weight, alloc1851, model_encoder_layers_20_fc2_bias, alloc1852) R.vm.kill_object(alloc1851) R.vm.kill_object(model_encoder_layers_20_fc2_weight) R.vm.kill_object(model_encoder_layers_20_fc2_bias) gv2348: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1853: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2348, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1849, alloc1852, alloc1853) R.vm.kill_object(alloc1849) R.vm.kill_object(alloc1852) model_encoder_layers_21_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[327] model_encoder_layers_21_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[328] gv2349: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1854: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2349, R.dtype("float16")) cls.layer_norm1(alloc1853, model_encoder_layers_21_self_attn_layer_norm_weight, model_encoder_layers_21_self_attn_layer_norm_bias, alloc1854) R.vm.kill_object(model_encoder_layers_21_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_21_self_attn_layer_norm_bias) model_encoder_layers_21_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[323] model_encoder_layers_21_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[324] gv2350: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1855: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2350, R.dtype("float16")) _1853: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_21_self_attn_q_proj_weight, alloc1854, model_encoder_layers_21_self_attn_q_proj_bias, alloc1855) R.vm.kill_object(model_encoder_layers_21_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_21_self_attn_q_proj_bias) gv2351: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape168: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1855, gv2351, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1855) model_encoder_layers_21_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[320] gv2352: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1856: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2352, R.dtype("float16")) _1854: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_21_self_attn_k_proj_weight, alloc1854, alloc1856) R.vm.kill_object(model_encoder_layers_21_self_attn_k_proj_weight) gv2353: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape169: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1856, gv2353, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1856) model_encoder_layers_21_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[321] model_encoder_layers_21_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[322] gv2354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1857: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2354, R.dtype("float16")) _1855: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_21_self_attn_v_proj_weight, alloc1854, model_encoder_layers_21_self_attn_v_proj_bias, alloc1857) R.vm.kill_object(alloc1854) R.vm.kill_object(model_encoder_layers_21_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_21_self_attn_v_proj_bias) gv2355: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape170: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1857, gv2355, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1857) gv2356: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape171: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape168, gv2356, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape168) gv2357: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape172: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape169, gv2357, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape169) gv2358: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape173: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape170, gv2358, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape170) gv2359: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1858: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2359, R.dtype("float16")) _1856: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape171, reshape172, reshape173, alloc1858) R.vm.kill_object(reshape171) R.vm.kill_object(reshape172) R.vm.kill_object(reshape173) gv2360: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape174: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1858, gv2360, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1858) gv2361: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape175: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape174, gv2361, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape174) model_encoder_layers_21_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[325] model_encoder_layers_21_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[326] gv2362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1859: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2362, R.dtype("float16")) _1857: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_encoder_layers_21_self_attn_out_proj_weight, reshape175, model_encoder_layers_21_self_attn_out_proj_bias, alloc1859) R.vm.kill_object(reshape175) R.vm.kill_object(model_encoder_layers_21_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_21_self_attn_out_proj_bias) gv2363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1860: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2363, R.dtype("float16")) cls.add4(alloc1853, alloc1859, alloc1860) R.vm.kill_object(alloc1853) R.vm.kill_object(alloc1859) model_encoder_layers_21_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[333] model_encoder_layers_21_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[334] gv2364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1861: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2364, R.dtype("float16")) cls.layer_norm1(alloc1860, model_encoder_layers_21_final_layer_norm_weight, model_encoder_layers_21_final_layer_norm_bias, alloc1861) R.vm.kill_object(model_encoder_layers_21_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_21_final_layer_norm_bias) model_encoder_layers_21_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[329] model_encoder_layers_21_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[330] gv2365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1862: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2365, R.dtype("float16")) _1860: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_21_fc1_weight, alloc1861, model_encoder_layers_21_fc1_bias, alloc1862) R.vm.kill_object(alloc1861) R.vm.kill_object(model_encoder_layers_21_fc1_weight) R.vm.kill_object(model_encoder_layers_21_fc1_bias) model_encoder_layers_21_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[331] model_encoder_layers_21_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[332] gv2366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1863: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2366, R.dtype("float16")) _1861: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_21_fc2_weight, alloc1862, model_encoder_layers_21_fc2_bias, alloc1863) R.vm.kill_object(alloc1862) R.vm.kill_object(model_encoder_layers_21_fc2_weight) R.vm.kill_object(model_encoder_layers_21_fc2_bias) gv2367: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1864: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2367, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1860, alloc1863, alloc1864) R.vm.kill_object(alloc1860) R.vm.kill_object(alloc1863) model_encoder_layers_22_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[342] model_encoder_layers_22_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[343] gv2368: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1865: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2368, R.dtype("float16")) cls.layer_norm1(alloc1864, model_encoder_layers_22_self_attn_layer_norm_weight, model_encoder_layers_22_self_attn_layer_norm_bias, alloc1865) R.vm.kill_object(model_encoder_layers_22_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_22_self_attn_layer_norm_bias) model_encoder_layers_22_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[338] model_encoder_layers_22_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[339] gv2369: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1866: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2369, R.dtype("float16")) _1864: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_22_self_attn_q_proj_weight, alloc1865, model_encoder_layers_22_self_attn_q_proj_bias, alloc1866) R.vm.kill_object(model_encoder_layers_22_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_22_self_attn_q_proj_bias) gv2370: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape176: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1866, gv2370, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc1866) model_encoder_layers_22_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[335] gv2371: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1867: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2371, R.dtype("float16")) _1865: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_22_self_attn_k_proj_weight, alloc1865, alloc1867) R.vm.kill_object(model_encoder_layers_22_self_attn_k_proj_weight) gv2372: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape177: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1867, gv2372, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1867) model_encoder_layers_22_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[336] model_encoder_layers_22_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[337] gv2373: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1868: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2373, R.dtype("float16")) _1866: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_22_self_attn_v_proj_weight, alloc1865, model_encoder_layers_22_self_attn_v_proj_bias, alloc1868) R.vm.kill_object(alloc1865) 
R.vm.kill_object(model_encoder_layers_22_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_22_self_attn_v_proj_bias) gv2374: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape178: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1868, gv2374, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1868) gv2375: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape179: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape176, gv2375, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape176) gv2376: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape180: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape177, gv2376, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape177) gv2377: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape181: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape178, gv2377, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape178) gv2378: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1869: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2378, R.dtype("float16")) _1867: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape179, reshape180, reshape181, alloc1869) R.vm.kill_object(reshape179) R.vm.kill_object(reshape180) R.vm.kill_object(reshape181) gv2379: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape182: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1869, gv2379, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1869) gv2380: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape183: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape182, gv2380, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape182) model_encoder_layers_22_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[340] model_encoder_layers_22_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[341] gv2381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc1870: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2381, R.dtype("float16")) _1868: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_22_self_attn_out_proj_weight, reshape183, model_encoder_layers_22_self_attn_out_proj_bias, alloc1870) R.vm.kill_object(reshape183) R.vm.kill_object(model_encoder_layers_22_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_22_self_attn_out_proj_bias) gv2382: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1871: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2382, R.dtype("float16")) cls.add4(alloc1864, alloc1870, alloc1871) R.vm.kill_object(alloc1864) R.vm.kill_object(alloc1870) model_encoder_layers_22_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[348] model_encoder_layers_22_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[349] gv2383: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1872: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2383, R.dtype("float16")) cls.layer_norm1(alloc1871, model_encoder_layers_22_final_layer_norm_weight, model_encoder_layers_22_final_layer_norm_bias, alloc1872) R.vm.kill_object(model_encoder_layers_22_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_22_final_layer_norm_bias) model_encoder_layers_22_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[344] model_encoder_layers_22_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[345] gv2384: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1873: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2384, R.dtype("float16")) _1871: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_22_fc1_weight, alloc1872, model_encoder_layers_22_fc1_bias, alloc1873) R.vm.kill_object(alloc1872) R.vm.kill_object(model_encoder_layers_22_fc1_weight) R.vm.kill_object(model_encoder_layers_22_fc1_bias) model_encoder_layers_22_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[346] model_encoder_layers_22_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[347] gv2385: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1874: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2385, R.dtype("float16")) _1872: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_22_fc2_weight, alloc1873, model_encoder_layers_22_fc2_bias, alloc1874) R.vm.kill_object(alloc1873) R.vm.kill_object(model_encoder_layers_22_fc2_weight) R.vm.kill_object(model_encoder_layers_22_fc2_bias) gv2386: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1875: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2386, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1871, alloc1874, alloc1875) R.vm.kill_object(alloc1871) R.vm.kill_object(alloc1874) 
model_encoder_layers_23_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[357] model_encoder_layers_23_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[358] gv2387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1876: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2387, R.dtype("float16")) cls.layer_norm1(alloc1875, model_encoder_layers_23_self_attn_layer_norm_weight, model_encoder_layers_23_self_attn_layer_norm_bias, alloc1876) R.vm.kill_object(model_encoder_layers_23_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_23_self_attn_layer_norm_bias) model_encoder_layers_23_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[353] model_encoder_layers_23_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[354] gv2388: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1877: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2388, R.dtype("float16")) _1875: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_23_self_attn_q_proj_weight, alloc1876, model_encoder_layers_23_self_attn_q_proj_bias, alloc1877) R.vm.kill_object(model_encoder_layers_23_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_23_self_attn_q_proj_bias) gv2389: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape184: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1877, gv2389, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1877) model_encoder_layers_23_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[350] gv2390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1878: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2390, R.dtype("float16")) _1876: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_23_self_attn_k_proj_weight, alloc1876, alloc1878) R.vm.kill_object(model_encoder_layers_23_self_attn_k_proj_weight) gv2391: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape185: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1878, gv2391, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1878) model_encoder_layers_23_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[351] model_encoder_layers_23_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[352] gv2392: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1879: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2392, R.dtype("float16")) _1877: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_23_self_attn_v_proj_weight, alloc1876, model_encoder_layers_23_self_attn_v_proj_bias, alloc1879) R.vm.kill_object(alloc1876) R.vm.kill_object(model_encoder_layers_23_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_23_self_attn_v_proj_bias) gv2393: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape186: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1879, gv2393, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1879) gv2394: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape187: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape184, gv2394, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape184) gv2395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape188: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape185, gv2395, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape185) gv2396: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape189: 
R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape186, gv2396, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape186) gv2397: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1880: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2397, R.dtype("float16")) _1878: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape187, reshape188, reshape189, alloc1880) R.vm.kill_object(reshape187) R.vm.kill_object(reshape188) R.vm.kill_object(reshape189) gv2398: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape190: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1880, gv2398, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1880) gv2399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape191: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape190, gv2399, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape190) model_encoder_layers_23_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[355] model_encoder_layers_23_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[356] 
gv2400: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1881: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2400, R.dtype("float16")) _1879: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_23_self_attn_out_proj_weight, reshape191, model_encoder_layers_23_self_attn_out_proj_bias, alloc1881) R.vm.kill_object(reshape191) R.vm.kill_object(model_encoder_layers_23_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_23_self_attn_out_proj_bias) gv2401: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1882: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2401, R.dtype("float16")) cls.add4(alloc1875, alloc1881, alloc1882) R.vm.kill_object(alloc1875) R.vm.kill_object(alloc1881) model_encoder_layers_23_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[363] model_encoder_layers_23_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[364] gv2402: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1883: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2402, R.dtype("float16")) cls.layer_norm1(alloc1882, model_encoder_layers_23_final_layer_norm_weight, model_encoder_layers_23_final_layer_norm_bias, alloc1883) R.vm.kill_object(model_encoder_layers_23_final_layer_norm_weight) 
R.vm.kill_object(model_encoder_layers_23_final_layer_norm_bias) model_encoder_layers_23_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[359] model_encoder_layers_23_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[360] gv2403: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1884: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2403, R.dtype("float16")) _1882: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_23_fc1_weight, alloc1883, model_encoder_layers_23_fc1_bias, alloc1884) R.vm.kill_object(alloc1883) R.vm.kill_object(model_encoder_layers_23_fc1_weight) R.vm.kill_object(model_encoder_layers_23_fc1_bias) model_encoder_layers_23_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[361] model_encoder_layers_23_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[362] gv2404: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1885: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2404, R.dtype("float16")) _1883: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_23_fc2_weight, alloc1884, model_encoder_layers_23_fc2_bias, alloc1885) R.vm.kill_object(alloc1884) R.vm.kill_object(model_encoder_layers_23_fc2_weight) R.vm.kill_object(model_encoder_layers_23_fc2_bias) gv2405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1886: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2405, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1882, alloc1885, alloc1886) R.vm.kill_object(alloc1882) R.vm.kill_object(alloc1885) model_encoder_layers_24_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[372] model_encoder_layers_24_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[373] gv2406: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1887: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2406, R.dtype("float16")) cls.layer_norm1(alloc1886, model_encoder_layers_24_self_attn_layer_norm_weight, model_encoder_layers_24_self_attn_layer_norm_bias, alloc1887) R.vm.kill_object(model_encoder_layers_24_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_24_self_attn_layer_norm_bias) model_encoder_layers_24_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[368] model_encoder_layers_24_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[369] gv2407: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1888: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2407, R.dtype("float16")) _1886: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_24_self_attn_q_proj_weight, alloc1887, model_encoder_layers_24_self_attn_q_proj_bias, alloc1888) R.vm.kill_object(model_encoder_layers_24_self_attn_q_proj_weight) 
R.vm.kill_object(model_encoder_layers_24_self_attn_q_proj_bias) gv2408: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape192: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1888, gv2408, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1888) model_encoder_layers_24_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[365] gv2409: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1889: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2409, R.dtype("float16")) _1887: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_24_self_attn_k_proj_weight, alloc1887, alloc1889) R.vm.kill_object(model_encoder_layers_24_self_attn_k_proj_weight) gv2410: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape193: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1889, gv2410, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1889) model_encoder_layers_24_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[366] model_encoder_layers_24_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[367] gv2411: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1890: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2411, R.dtype("float16")) _1888: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_24_self_attn_v_proj_weight, alloc1887, model_encoder_layers_24_self_attn_v_proj_bias, alloc1890) R.vm.kill_object(alloc1887) R.vm.kill_object(model_encoder_layers_24_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_24_self_attn_v_proj_bias) gv2412: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape194: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1890, gv2412, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1890) gv2413: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape195: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape192, gv2413, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape192) gv2414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape196: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape193, gv2414, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), 
dtype="float16"),)) R.vm.kill_object(reshape193) gv2415: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape197: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape194, gv2415, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape194) gv2416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1891: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2416, R.dtype("float16")) _1889: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape195, reshape196, reshape197, alloc1891) R.vm.kill_object(reshape195) R.vm.kill_object(reshape196) R.vm.kill_object(reshape197) gv2417: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape198: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1891, gv2417, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1891) gv2418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape199: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape198, gv2418, 
sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape198) model_encoder_layers_24_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[370] model_encoder_layers_24_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[371] gv2419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1892: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2419, R.dtype("float16")) _1890: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_24_self_attn_out_proj_weight, reshape199, model_encoder_layers_24_self_attn_out_proj_bias, alloc1892) R.vm.kill_object(reshape199) R.vm.kill_object(model_encoder_layers_24_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_24_self_attn_out_proj_bias) gv2420: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1893: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2420, R.dtype("float16")) cls.add4(alloc1886, alloc1892, alloc1893) R.vm.kill_object(alloc1886) R.vm.kill_object(alloc1892) model_encoder_layers_24_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[378] model_encoder_layers_24_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[379] gv2421: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1894: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2421, R.dtype("float16")) cls.layer_norm1(alloc1893, model_encoder_layers_24_final_layer_norm_weight, model_encoder_layers_24_final_layer_norm_bias, alloc1894) R.vm.kill_object(model_encoder_layers_24_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_24_final_layer_norm_bias) model_encoder_layers_24_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[374] model_encoder_layers_24_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[375] gv2422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1895: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2422, R.dtype("float16")) _1893: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_24_fc1_weight, alloc1894, model_encoder_layers_24_fc1_bias, alloc1895) R.vm.kill_object(alloc1894) R.vm.kill_object(model_encoder_layers_24_fc1_weight) R.vm.kill_object(model_encoder_layers_24_fc1_bias) model_encoder_layers_24_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[376] model_encoder_layers_24_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[377] gv2423: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1896: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2423, R.dtype("float16")) _1894: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_24_fc2_weight, alloc1895, model_encoder_layers_24_fc2_bias, alloc1896) R.vm.kill_object(alloc1895) 
R.vm.kill_object(model_encoder_layers_24_fc2_weight) R.vm.kill_object(model_encoder_layers_24_fc2_bias) gv2424: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1897: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2424, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1893, alloc1896, alloc1897) R.vm.kill_object(alloc1893) R.vm.kill_object(alloc1896) model_encoder_layers_25_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[387] model_encoder_layers_25_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[388] gv2425: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1898: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2425, R.dtype("float16")) cls.layer_norm1(alloc1897, model_encoder_layers_25_self_attn_layer_norm_weight, model_encoder_layers_25_self_attn_layer_norm_bias, alloc1898) R.vm.kill_object(model_encoder_layers_25_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_25_self_attn_layer_norm_bias) model_encoder_layers_25_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[383] model_encoder_layers_25_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[384] gv2426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1899: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2426, R.dtype("float16")) 
_1897: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_25_self_attn_q_proj_weight, alloc1898, model_encoder_layers_25_self_attn_q_proj_bias, alloc1899) R.vm.kill_object(model_encoder_layers_25_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_25_self_attn_q_proj_bias) gv2427: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape200: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1899, gv2427, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1899) model_encoder_layers_25_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[380] gv2428: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1900: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2428, R.dtype("float16")) _1898: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_25_self_attn_k_proj_weight, alloc1898, alloc1900) R.vm.kill_object(model_encoder_layers_25_self_attn_k_proj_weight) gv2429: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape201: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1900, gv2429, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1900) 
model_encoder_layers_25_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[381] model_encoder_layers_25_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[382] gv2430: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1901: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2430, R.dtype("float16")) _1899: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_25_self_attn_v_proj_weight, alloc1898, model_encoder_layers_25_self_attn_v_proj_bias, alloc1901) R.vm.kill_object(alloc1898) R.vm.kill_object(model_encoder_layers_25_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_25_self_attn_v_proj_bias) gv2431: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape202: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1901, gv2431, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1901) gv2432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape203: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape200, gv2432, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape200) gv2433: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape204: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape201, gv2433, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape201) gv2434: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape205: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape202, gv2434, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape202) gv2435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1902: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2435, R.dtype("float16")) _1900: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape203, reshape204, reshape205, alloc1902) R.vm.kill_object(reshape203) R.vm.kill_object(reshape204) R.vm.kill_object(reshape205) gv2436: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape206: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1902, gv2436, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1902) gv2437: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape207: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape206, gv2437, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape206) model_encoder_layers_25_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[385] model_encoder_layers_25_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[386] gv2438: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1903: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2438, R.dtype("float16")) _1901: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_25_self_attn_out_proj_weight, reshape207, model_encoder_layers_25_self_attn_out_proj_bias, alloc1903) R.vm.kill_object(reshape207) R.vm.kill_object(model_encoder_layers_25_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_25_self_attn_out_proj_bias) gv2439: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1904: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2439, R.dtype("float16")) cls.add4(alloc1897, alloc1903, alloc1904) R.vm.kill_object(alloc1897) R.vm.kill_object(alloc1903) model_encoder_layers_25_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[393] model_encoder_layers_25_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[394] gv2440: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1905: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2440, R.dtype("float16")) cls.layer_norm1(alloc1904, model_encoder_layers_25_final_layer_norm_weight, model_encoder_layers_25_final_layer_norm_bias, alloc1905) R.vm.kill_object(model_encoder_layers_25_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_25_final_layer_norm_bias) model_encoder_layers_25_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[389] model_encoder_layers_25_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[390] gv2441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1906: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2441, R.dtype("float16")) _1904: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_25_fc1_weight, alloc1905, model_encoder_layers_25_fc1_bias, alloc1906) R.vm.kill_object(alloc1905) R.vm.kill_object(model_encoder_layers_25_fc1_weight) R.vm.kill_object(model_encoder_layers_25_fc1_bias) model_encoder_layers_25_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[391] model_encoder_layers_25_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[392] gv2442: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1907: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2442, 
R.dtype("float16")) _1905: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_25_fc2_weight, alloc1906, model_encoder_layers_25_fc2_bias, alloc1907) R.vm.kill_object(alloc1906) R.vm.kill_object(model_encoder_layers_25_fc2_weight) R.vm.kill_object(model_encoder_layers_25_fc2_bias) gv2443: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1908: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2443, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1904, alloc1907, alloc1908) R.vm.kill_object(alloc1904) R.vm.kill_object(alloc1907) model_encoder_layers_26_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[402] model_encoder_layers_26_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[403] gv2444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1909: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2444, R.dtype("float16")) cls.layer_norm1(alloc1908, model_encoder_layers_26_self_attn_layer_norm_weight, model_encoder_layers_26_self_attn_layer_norm_bias, alloc1909) R.vm.kill_object(model_encoder_layers_26_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_26_self_attn_layer_norm_bias) model_encoder_layers_26_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[398] model_encoder_layers_26_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[399] gv2445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1910: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2445, R.dtype("float16")) _1908: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_26_self_attn_q_proj_weight, alloc1909, model_encoder_layers_26_self_attn_q_proj_bias, alloc1910) R.vm.kill_object(model_encoder_layers_26_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_26_self_attn_q_proj_bias) gv2446: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape208: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1910, gv2446, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1910) model_encoder_layers_26_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[395] gv2447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1911: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2447, R.dtype("float16")) _1909: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_26_self_attn_k_proj_weight, alloc1909, alloc1911) R.vm.kill_object(model_encoder_layers_26_self_attn_k_proj_weight) gv2448: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) 
reshape209: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1911, gv2448, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1911) model_encoder_layers_26_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[396] model_encoder_layers_26_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[397] gv2449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1912: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2449, R.dtype("float16")) _1910: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_26_self_attn_v_proj_weight, alloc1909, model_encoder_layers_26_self_attn_v_proj_bias, alloc1912) R.vm.kill_object(alloc1909) R.vm.kill_object(model_encoder_layers_26_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_26_self_attn_v_proj_bias) gv2450: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape210: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1912, gv2450, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1912) gv2451: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape211: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape208, gv2451, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape208) gv2452: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape212: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape209, gv2452, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape209) gv2453: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape213: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape210, gv2453, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape210) gv2454: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1913: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2454, R.dtype("float16")) _1911: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape211, reshape212, reshape213, alloc1913) R.vm.kill_object(reshape211) R.vm.kill_object(reshape212) R.vm.kill_object(reshape213) gv2455: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape214: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1913, gv2455, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1913) gv2456: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape215: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape214, gv2456, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape214) model_encoder_layers_26_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[400] model_encoder_layers_26_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[401] gv2457: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1914: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2457, R.dtype("float16")) _1912: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_26_self_attn_out_proj_weight, reshape215, model_encoder_layers_26_self_attn_out_proj_bias, alloc1914) R.vm.kill_object(reshape215) R.vm.kill_object(model_encoder_layers_26_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_26_self_attn_out_proj_bias) gv2458: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1915: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2458, R.dtype("float16")) cls.add4(alloc1908, alloc1914, alloc1915) R.vm.kill_object(alloc1908) 
R.vm.kill_object(alloc1914) model_encoder_layers_26_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[408] model_encoder_layers_26_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[409] gv2459: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1916: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2459, R.dtype("float16")) cls.layer_norm1(alloc1915, model_encoder_layers_26_final_layer_norm_weight, model_encoder_layers_26_final_layer_norm_bias, alloc1916) R.vm.kill_object(model_encoder_layers_26_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_26_final_layer_norm_bias) model_encoder_layers_26_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[404] model_encoder_layers_26_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[405] gv2460: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1917: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2460, R.dtype("float16")) _1915: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_26_fc1_weight, alloc1916, model_encoder_layers_26_fc1_bias, alloc1917) R.vm.kill_object(alloc1916) R.vm.kill_object(model_encoder_layers_26_fc1_weight) R.vm.kill_object(model_encoder_layers_26_fc1_bias) model_encoder_layers_26_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[406] model_encoder_layers_26_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[407] gv2461: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1918: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2461, R.dtype("float16")) _1916: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_26_fc2_weight, alloc1917, model_encoder_layers_26_fc2_bias, alloc1918) R.vm.kill_object(alloc1917) R.vm.kill_object(model_encoder_layers_26_fc2_weight) R.vm.kill_object(model_encoder_layers_26_fc2_bias) gv2462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1919: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2462, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1915, alloc1918, alloc1919) R.vm.kill_object(alloc1915) R.vm.kill_object(alloc1918) model_encoder_layers_27_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[417] model_encoder_layers_27_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[418] gv2463: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1920: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2463, R.dtype("float16")) cls.layer_norm1(alloc1919, model_encoder_layers_27_self_attn_layer_norm_weight, model_encoder_layers_27_self_attn_layer_norm_bias, alloc1920) R.vm.kill_object(model_encoder_layers_27_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_27_self_attn_layer_norm_bias) model_encoder_layers_27_self_attn_q_proj_weight: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[413] model_encoder_layers_27_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[414] gv2464: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1921: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2464, R.dtype("float16")) _1919: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_27_self_attn_q_proj_weight, alloc1920, model_encoder_layers_27_self_attn_q_proj_bias, alloc1921) R.vm.kill_object(model_encoder_layers_27_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_27_self_attn_q_proj_bias) gv2465: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape216: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1921, gv2465, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1921) model_encoder_layers_27_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[410] gv2466: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1922: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2466, R.dtype("float16")) _1920: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_27_self_attn_k_proj_weight, alloc1920, alloc1922) R.vm.kill_object(model_encoder_layers_27_self_attn_k_proj_weight) 
gv2467: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape217: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1922, gv2467, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1922) model_encoder_layers_27_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[411] model_encoder_layers_27_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[412] gv2468: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1923: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2468, R.dtype("float16")) _1921: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_27_self_attn_v_proj_weight, alloc1920, model_encoder_layers_27_self_attn_v_proj_bias, alloc1923) R.vm.kill_object(alloc1920) R.vm.kill_object(model_encoder_layers_27_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_27_self_attn_v_proj_bias) gv2469: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape218: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1923, gv2469, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1923) gv2470: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape219: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape216, gv2470, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape216) gv2471: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape220: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape217, gv2471, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape217) gv2472: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape221: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape218, gv2472, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape218) gv2473: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1924: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2473, R.dtype("float16")) _1922: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape219, reshape220, reshape221, alloc1924) R.vm.kill_object(reshape219) R.vm.kill_object(reshape220) R.vm.kill_object(reshape221) gv2474: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape222: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1924, gv2474, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1924) gv2475: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape223: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape222, gv2475, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape222) model_encoder_layers_27_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[415] model_encoder_layers_27_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[416] gv2476: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1925: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2476, R.dtype("float16")) _1923: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_27_self_attn_out_proj_weight, reshape223, model_encoder_layers_27_self_attn_out_proj_bias, alloc1925) R.vm.kill_object(reshape223) R.vm.kill_object(model_encoder_layers_27_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_27_self_attn_out_proj_bias) gv2477: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc1926: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2477, R.dtype("float16")) cls.add4(alloc1919, alloc1925, alloc1926) R.vm.kill_object(alloc1919) R.vm.kill_object(alloc1925) model_encoder_layers_27_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[423] model_encoder_layers_27_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[424] gv2478: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1927: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2478, R.dtype("float16")) cls.layer_norm1(alloc1926, model_encoder_layers_27_final_layer_norm_weight, model_encoder_layers_27_final_layer_norm_bias, alloc1927) R.vm.kill_object(model_encoder_layers_27_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_27_final_layer_norm_bias) model_encoder_layers_27_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[419] model_encoder_layers_27_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[420] gv2479: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1928: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2479, R.dtype("float16")) _1926: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_27_fc1_weight, alloc1927, model_encoder_layers_27_fc1_bias, alloc1928) R.vm.kill_object(alloc1927) R.vm.kill_object(model_encoder_layers_27_fc1_weight) R.vm.kill_object(model_encoder_layers_27_fc1_bias) model_encoder_layers_27_fc2_weight: R.Tensor((1280, 
5120), dtype="float16") = packed_params[421] model_encoder_layers_27_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[422] gv2480: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1929: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2480, R.dtype("float16")) _1927: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_27_fc2_weight, alloc1928, model_encoder_layers_27_fc2_bias, alloc1929) R.vm.kill_object(alloc1928) R.vm.kill_object(model_encoder_layers_27_fc2_weight) R.vm.kill_object(model_encoder_layers_27_fc2_bias) gv2481: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1930: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2481, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1926, alloc1929, alloc1930) R.vm.kill_object(alloc1926) R.vm.kill_object(alloc1929) model_encoder_layers_28_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[432] model_encoder_layers_28_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[433] gv2482: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1931: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2482, R.dtype("float16")) cls.layer_norm1(alloc1930, model_encoder_layers_28_self_attn_layer_norm_weight, model_encoder_layers_28_self_attn_layer_norm_bias, alloc1931) 
R.vm.kill_object(model_encoder_layers_28_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_28_self_attn_layer_norm_bias) model_encoder_layers_28_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[428] model_encoder_layers_28_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[429] gv2483: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1932: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2483, R.dtype("float16")) _1930: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_28_self_attn_q_proj_weight, alloc1931, model_encoder_layers_28_self_attn_q_proj_bias, alloc1932) R.vm.kill_object(model_encoder_layers_28_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_28_self_attn_q_proj_bias) gv2484: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape224: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1932, gv2484, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1932) model_encoder_layers_28_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[425] gv2485: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1933: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2485, R.dtype("float16")) _1931: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_28_self_attn_k_proj_weight, alloc1931, alloc1933) R.vm.kill_object(model_encoder_layers_28_self_attn_k_proj_weight) gv2486: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape225: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1933, gv2486, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1933) model_encoder_layers_28_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[426] model_encoder_layers_28_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[427] gv2487: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1934: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2487, R.dtype("float16")) _1932: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_28_self_attn_v_proj_weight, alloc1931, model_encoder_layers_28_self_attn_v_proj_bias, alloc1934) R.vm.kill_object(alloc1931) R.vm.kill_object(model_encoder_layers_28_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_28_self_attn_v_proj_bias) gv2488: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape226: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1934, gv2488, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1934) gv2489: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape227: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape224, gv2489, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape224) gv2490: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape228: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape225, gv2490, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape225) gv2491: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape229: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape226, gv2491, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape226) gv2492: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1935: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2492, R.dtype("float16")) _1933: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape227, reshape228, 
reshape229, alloc1935) R.vm.kill_object(reshape227) R.vm.kill_object(reshape228) R.vm.kill_object(reshape229) gv2493: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape230: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1935, gv2493, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1935) gv2494: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape231: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape230, gv2494, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape230) model_encoder_layers_28_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[430] model_encoder_layers_28_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[431] gv2495: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1936: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2495, R.dtype("float16")) _1934: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_28_self_attn_out_proj_weight, reshape231, model_encoder_layers_28_self_attn_out_proj_bias, alloc1936) R.vm.kill_object(reshape231) R.vm.kill_object(model_encoder_layers_28_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_28_self_attn_out_proj_bias) 
gv2496: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1937: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2496, R.dtype("float16")) cls.add4(alloc1930, alloc1936, alloc1937) R.vm.kill_object(alloc1930) R.vm.kill_object(alloc1936) model_encoder_layers_28_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[438] model_encoder_layers_28_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[439] gv2497: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1938: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2497, R.dtype("float16")) cls.layer_norm1(alloc1937, model_encoder_layers_28_final_layer_norm_weight, model_encoder_layers_28_final_layer_norm_bias, alloc1938) R.vm.kill_object(model_encoder_layers_28_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_28_final_layer_norm_bias) model_encoder_layers_28_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[434] model_encoder_layers_28_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[435] gv2498: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1939: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2498, R.dtype("float16")) _1937: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_28_fc1_weight, alloc1938, 
model_encoder_layers_28_fc1_bias, alloc1939) R.vm.kill_object(alloc1938) R.vm.kill_object(model_encoder_layers_28_fc1_weight) R.vm.kill_object(model_encoder_layers_28_fc1_bias) model_encoder_layers_28_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[436] model_encoder_layers_28_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[437] gv2499: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1940: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2499, R.dtype("float16")) _1938: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_28_fc2_weight, alloc1939, model_encoder_layers_28_fc2_bias, alloc1940) R.vm.kill_object(alloc1939) R.vm.kill_object(model_encoder_layers_28_fc2_weight) R.vm.kill_object(model_encoder_layers_28_fc2_bias) gv2500: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1941: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2500, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1937, alloc1940, alloc1941) R.vm.kill_object(alloc1937) R.vm.kill_object(alloc1940) model_encoder_layers_29_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[447] model_encoder_layers_29_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[448] gv2501: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1942: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2501, R.dtype("float16")) cls.layer_norm1(alloc1941, model_encoder_layers_29_self_attn_layer_norm_weight, model_encoder_layers_29_self_attn_layer_norm_bias, alloc1942) R.vm.kill_object(model_encoder_layers_29_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_29_self_attn_layer_norm_bias) model_encoder_layers_29_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[443] model_encoder_layers_29_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[444] gv2502: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1943: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2502, R.dtype("float16")) _1941: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_29_self_attn_q_proj_weight, alloc1942, model_encoder_layers_29_self_attn_q_proj_bias, alloc1943) R.vm.kill_object(model_encoder_layers_29_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_29_self_attn_q_proj_bias) gv2503: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape232: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1943, gv2503, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1943) model_encoder_layers_29_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[440] gv2504: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1944: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2504, R.dtype("float16")) _1942: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_29_self_attn_k_proj_weight, alloc1942, alloc1944) R.vm.kill_object(model_encoder_layers_29_self_attn_k_proj_weight) gv2505: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape233: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1944, gv2505, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1944) model_encoder_layers_29_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[441] model_encoder_layers_29_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[442] gv2506: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1945: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2506, R.dtype("float16")) _1943: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_29_self_attn_v_proj_weight, alloc1942, model_encoder_layers_29_self_attn_v_proj_bias, alloc1945) R.vm.kill_object(alloc1942) R.vm.kill_object(model_encoder_layers_29_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_29_self_attn_v_proj_bias) gv2507: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape234: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1945, gv2507, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1945) gv2508: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape235: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape232, gv2508, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape232) gv2509: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape236: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape233, gv2509, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape233) gv2510: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape237: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape234, gv2510, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape234) gv2511: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1946: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2511, R.dtype("float16")) _1944: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape235, reshape236, reshape237, alloc1946) R.vm.kill_object(reshape235) R.vm.kill_object(reshape236) R.vm.kill_object(reshape237) gv2512: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape238: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1946, gv2512, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1946) gv2513: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape239: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape238, gv2513, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape238) model_encoder_layers_29_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[445] model_encoder_layers_29_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[446] gv2514: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1947: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2514, R.dtype("float16")) _1945: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_encoder_layers_29_self_attn_out_proj_weight, reshape239, model_encoder_layers_29_self_attn_out_proj_bias, alloc1947) R.vm.kill_object(reshape239) R.vm.kill_object(model_encoder_layers_29_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_29_self_attn_out_proj_bias) gv2515: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1948: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2515, R.dtype("float16")) cls.add4(alloc1941, alloc1947, alloc1948) R.vm.kill_object(alloc1941) R.vm.kill_object(alloc1947) model_encoder_layers_29_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[453] model_encoder_layers_29_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[454] gv2516: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1949: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2516, R.dtype("float16")) cls.layer_norm1(alloc1948, model_encoder_layers_29_final_layer_norm_weight, model_encoder_layers_29_final_layer_norm_bias, alloc1949) R.vm.kill_object(model_encoder_layers_29_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_29_final_layer_norm_bias) model_encoder_layers_29_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[449] model_encoder_layers_29_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[450] gv2517: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1950: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2517, R.dtype("float16")) _1948: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_29_fc1_weight, alloc1949, model_encoder_layers_29_fc1_bias, alloc1950) R.vm.kill_object(alloc1949) R.vm.kill_object(model_encoder_layers_29_fc1_weight) R.vm.kill_object(model_encoder_layers_29_fc1_bias) model_encoder_layers_29_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[451] model_encoder_layers_29_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[452] gv2518: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1951: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2518, R.dtype("float16")) _1949: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_29_fc2_weight, alloc1950, model_encoder_layers_29_fc2_bias, alloc1951) R.vm.kill_object(alloc1950) R.vm.kill_object(model_encoder_layers_29_fc2_weight) R.vm.kill_object(model_encoder_layers_29_fc2_bias) gv2519: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1952: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2519, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1948, alloc1951, alloc1952) R.vm.kill_object(alloc1948) R.vm.kill_object(alloc1951) model_encoder_layers_30_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[462] model_encoder_layers_30_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[463] gv2520: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1953: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2520, R.dtype("float16")) cls.layer_norm1(alloc1952, model_encoder_layers_30_self_attn_layer_norm_weight, model_encoder_layers_30_self_attn_layer_norm_bias, alloc1953) R.vm.kill_object(model_encoder_layers_30_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_30_self_attn_layer_norm_bias) model_encoder_layers_30_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[458] model_encoder_layers_30_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[459] gv2521: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1954: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2521, R.dtype("float16")) _1952: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_30_self_attn_q_proj_weight, alloc1953, model_encoder_layers_30_self_attn_q_proj_bias, alloc1954) R.vm.kill_object(model_encoder_layers_30_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_30_self_attn_q_proj_bias) gv2522: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape240: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1954, gv2522, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc1954) model_encoder_layers_30_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[455] gv2523: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1955: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2523, R.dtype("float16")) _1953: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_30_self_attn_k_proj_weight, alloc1953, alloc1955) R.vm.kill_object(model_encoder_layers_30_self_attn_k_proj_weight) gv2524: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape241: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1955, gv2524, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1955) model_encoder_layers_30_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[456] model_encoder_layers_30_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[457] gv2525: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1956: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2525, R.dtype("float16")) _1954: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_30_self_attn_v_proj_weight, alloc1953, model_encoder_layers_30_self_attn_v_proj_bias, alloc1956) R.vm.kill_object(alloc1953) 
R.vm.kill_object(model_encoder_layers_30_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_30_self_attn_v_proj_bias) gv2526: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape242: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1956, gv2526, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1956) gv2527: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape243: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape240, gv2527, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape240) gv2528: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape244: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape241, gv2528, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape241) gv2529: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape245: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape242, gv2529, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape242) gv2530: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1957: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2530, R.dtype("float16")) _1955: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape243, reshape244, reshape245, alloc1957) R.vm.kill_object(reshape243) R.vm.kill_object(reshape244) R.vm.kill_object(reshape245) gv2531: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape246: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1957, gv2531, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1957) gv2532: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape247: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape246, gv2532, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape246) model_encoder_layers_30_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[460] model_encoder_layers_30_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[461] gv2533: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc1958: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2533, R.dtype("float16")) _1956: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_30_self_attn_out_proj_weight, reshape247, model_encoder_layers_30_self_attn_out_proj_bias, alloc1958) R.vm.kill_object(reshape247) R.vm.kill_object(model_encoder_layers_30_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_30_self_attn_out_proj_bias) gv2534: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1959: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2534, R.dtype("float16")) cls.add4(alloc1952, alloc1958, alloc1959) R.vm.kill_object(alloc1952) R.vm.kill_object(alloc1958) model_encoder_layers_30_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[468] model_encoder_layers_30_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[469] gv2535: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1960: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2535, R.dtype("float16")) cls.layer_norm1(alloc1959, model_encoder_layers_30_final_layer_norm_weight, model_encoder_layers_30_final_layer_norm_bias, alloc1960) R.vm.kill_object(model_encoder_layers_30_final_layer_norm_weight) R.vm.kill_object(model_encoder_layers_30_final_layer_norm_bias) model_encoder_layers_30_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[464] model_encoder_layers_30_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[465] gv2536: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1961: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2536, R.dtype("float16")) _1959: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_30_fc1_weight, alloc1960, model_encoder_layers_30_fc1_bias, alloc1961) R.vm.kill_object(alloc1960) R.vm.kill_object(model_encoder_layers_30_fc1_weight) R.vm.kill_object(model_encoder_layers_30_fc1_bias) model_encoder_layers_30_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[466] model_encoder_layers_30_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[467] gv2537: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1962: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2537, R.dtype("float16")) _1960: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_30_fc2_weight, alloc1961, model_encoder_layers_30_fc2_bias, alloc1962) R.vm.kill_object(alloc1961) R.vm.kill_object(model_encoder_layers_30_fc2_weight) R.vm.kill_object(model_encoder_layers_30_fc2_bias) gv2538: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1963: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2538, R.dtype("float16")) cls.fused_add4_maximum_minimum(alloc1959, alloc1962, alloc1963) R.vm.kill_object(alloc1959) R.vm.kill_object(alloc1962) 
model_encoder_layers_31_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[477] model_encoder_layers_31_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[478] gv2539: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1964: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2539, R.dtype("float16")) cls.layer_norm1(alloc1963, model_encoder_layers_31_self_attn_layer_norm_weight, model_encoder_layers_31_self_attn_layer_norm_bias, alloc1964) R.vm.kill_object(model_encoder_layers_31_self_attn_layer_norm_weight) R.vm.kill_object(model_encoder_layers_31_self_attn_layer_norm_bias) model_encoder_layers_31_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[473] model_encoder_layers_31_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[474] gv2540: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1965: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2540, R.dtype("float16")) _1963: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_31_self_attn_q_proj_weight, alloc1964, model_encoder_layers_31_self_attn_q_proj_bias, alloc1965) R.vm.kill_object(model_encoder_layers_31_self_attn_q_proj_weight) R.vm.kill_object(model_encoder_layers_31_self_attn_q_proj_bias) gv2541: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape248: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1965, gv2541, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1965) model_encoder_layers_31_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[470] gv2542: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1966: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2542, R.dtype("float16")) _1964: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_31_self_attn_k_proj_weight, alloc1964, alloc1966) R.vm.kill_object(model_encoder_layers_31_self_attn_k_proj_weight) gv2543: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape249: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1966, gv2543, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1966) model_encoder_layers_31_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[471] model_encoder_layers_31_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[472] gv2544: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1967: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2544, R.dtype("float16")) _1965: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_31_self_attn_v_proj_weight, alloc1964, model_encoder_layers_31_self_attn_v_proj_bias, alloc1967) R.vm.kill_object(alloc1964) R.vm.kill_object(model_encoder_layers_31_self_attn_v_proj_weight) R.vm.kill_object(model_encoder_layers_31_self_attn_v_proj_bias) gv2545: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape250: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1967, gv2545, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1967) gv2546: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape251: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape248, gv2546, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape248) gv2547: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape252: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape249, gv2547, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape249) gv2548: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape253: 
R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape250, gv2548, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape250) gv2549: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1968: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2549, R.dtype("float16")) _1966: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape251, reshape252, reshape253, alloc1968) R.vm.kill_object(reshape251) R.vm.kill_object(reshape252) R.vm.kill_object(reshape253) gv2550: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape254: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1968, gv2550, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1968) gv2551: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape255: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape254, gv2551, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) R.vm.kill_object(reshape254) model_encoder_layers_31_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[475] model_encoder_layers_31_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[476] 
gv2552: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1969: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2552, R.dtype("float16")) _1967: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_31_self_attn_out_proj_weight, reshape255, model_encoder_layers_31_self_attn_out_proj_bias, alloc1969) R.vm.kill_object(reshape255) R.vm.kill_object(model_encoder_layers_31_self_attn_out_proj_weight) R.vm.kill_object(model_encoder_layers_31_self_attn_out_proj_bias) gv2553: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1970: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2553, R.dtype("float16")) R.vm.kill_object(storage25) cls.add4(alloc1963, alloc1969, alloc1970) R.vm.kill_object(alloc1963) R.vm.kill_object(alloc1969) model_encoder_layers_31_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[483] model_encoder_layers_31_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[484] gv2554: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1971: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2554, R.dtype("float16")) R.vm.kill_object(storage28) cls.layer_norm1(alloc1970, model_encoder_layers_31_final_layer_norm_weight, model_encoder_layers_31_final_layer_norm_bias, alloc1971) R.vm.kill_object(model_encoder_layers_31_final_layer_norm_weight) 
R.vm.kill_object(model_encoder_layers_31_final_layer_norm_bias) model_encoder_layers_31_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[479] model_encoder_layers_31_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[480] gv2555: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1972: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2555, R.dtype("float16")) R.vm.kill_object(storage24) _1970: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_31_fc1_weight, alloc1971, model_encoder_layers_31_fc1_bias, alloc1972) R.vm.kill_object(alloc1971) R.vm.kill_object(model_encoder_layers_31_fc1_weight) R.vm.kill_object(model_encoder_layers_31_fc1_bias) model_encoder_layers_31_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[481] model_encoder_layers_31_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[482] gv2556: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1973: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2556, R.dtype("float16")) R.vm.kill_object(storage26) _1971: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_31_fc2_weight, alloc1972, model_encoder_layers_31_fc2_bias, alloc1973) R.vm.kill_object(alloc1972) R.vm.kill_object(model_encoder_layers_31_fc2_weight) R.vm.kill_object(model_encoder_layers_31_fc2_bias) gv2557: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1974: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2557, R.dtype("float16")) R.vm.kill_object(storage27) cls.fused_add4_maximum_minimum(alloc1970, alloc1973, alloc1974) R.vm.kill_object(alloc1970) R.vm.kill_object(alloc1973) model_encoder_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[485] model_encoder_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[486] storage29: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2558: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1975: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage29, R.prim_value(0), gv2558, R.dtype("float16")) R.vm.kill_object(storage29) cls.layer_norm1(alloc1974, model_encoder_layer_norm_weight, model_encoder_layer_norm_bias, alloc1975) R.vm.kill_object(alloc1974) R.vm.kill_object(model_encoder_layer_norm_weight) R.vm.kill_object(model_encoder_layer_norm_bias) R.call_packed("vm.builtin.match_shape", alloc1975, shape_heap, R.prim_value(3), R.prim_value(3), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), R.str("ErrorContext(fn=batch_encode, loc=return, annotation=R.Tensor((batch_size, 1500, 1280), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) return alloc1975 @R.function def batch_prefill(input_ids: R.Tensor((1, "seq_len"), dtype="int32"), logit_positions: R.Tensor(("batch_size",), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor((1, "batch_size", 51866), dtype="float32"): batch_size = T.int64() seq_len = T.int64() R.func_attr({"num_input": 3, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, 
"seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=batch_prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", logit_positions, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=batch_prefill, loc=param[1], param=logit_positions, annotation=R.Tensor((batch_size,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_prefill, loc=param[3], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 
5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.str("ErrorContext(fn=batch_prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", logit_positions, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=batch_prefill, loc=param[1], param=logit_positions, 
annotation=R.Tensor((batch_size,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) model_decoder_embed_tokens_weight2: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] gv10: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) reshape384: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, gv10, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) model_decoder_embed_tokens_weight2_1: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] storage4: R.Object = R.vm.alloc_storage(R.shape([153600000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv11: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc4: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv11, R.dtype("float16")) cls.take(model_decoder_embed_tokens_weight2_1, reshape384, alloc4) R.vm.kill_object(reshape384) R.vm.kill_object(model_decoder_embed_tokens_weight2_1) gv12: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape385: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc4, gv12, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(alloc4) lv68: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) model_decoder_embed_positions_weight2: R.Tensor((448, 1280), dtype="float16") = packed_params[488] storage5: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv13: R.Shape(ndim=2) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc5: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv13, R.dtype("float16")) cls.take1(model_decoder_embed_positions_weight2, lv68, alloc5) R.vm.kill_object(lv68) R.vm.kill_object(model_decoder_embed_positions_weight2) gv14: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape386: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc5, gv14, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(alloc5) storage6: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv15: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc6: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv15, R.dtype("float16")) cls.add5(reshape385, reshape386, alloc6) R.vm.kill_object(reshape385) R.vm.kill_object(reshape386) model_decoder_layers_0_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[496] model_decoder_layers_0_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[497] gv16: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc7: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv16, R.dtype("float16")) cls.layer_norm2(alloc6, 
model_decoder_layers_0_self_attn_layer_norm_weight2, model_decoder_layers_0_self_attn_layer_norm_bias2, alloc7) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias2) model_decoder_layers_0_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] model_decoder_layers_0_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[493] gv17: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc8: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv17, R.dtype("float16")) _6: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_q_proj_weight2, alloc7, model_decoder_layers_0_self_attn_q_proj_bias2, alloc8) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias2) gv18: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape387: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc8, gv18, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc8) model_decoder_layers_0_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] storage7: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv19: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc9: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv19, R.dtype("float16")) _7: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_0_self_attn_k_proj_weight2, alloc7, alloc9) R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight2) gv20: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape388: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc9, gv20, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc9) model_decoder_layers_0_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] model_decoder_layers_0_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[491] storage8: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv21: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc10: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv21, R.dtype("float16")) _8: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_v_proj_weight2, alloc7, model_decoder_layers_0_self_attn_v_proj_bias2, alloc10) R.vm.kill_object(alloc7) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias2) gv22: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape389: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc10, gv22, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc10) gv23: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc11: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv23, R.dtype("float16")) cls.concatenate1(reshape387, reshape388, reshape389, alloc11) R.vm.kill_object(reshape387) R.vm.kill_object(reshape388) R.vm.kill_object(reshape389) gv24: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape390: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc11, gv24, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc11) gv25: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc12: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv25, R.dtype("float16")) _10: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape390, alloc12) R.vm.kill_object(reshape390) gv26: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape391: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc12, gv26, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc12) gv27: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape392: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape391, gv27, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape391) model_decoder_layers_0_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] model_decoder_layers_0_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[495] gv28: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc13: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv28, R.dtype("float16")) _11: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_out_proj_weight2, reshape392, model_decoder_layers_0_self_attn_out_proj_bias2, alloc13) R.vm.kill_object(reshape392) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias2) gv29: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc14: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv29, 
R.dtype("float16")) cls.add5(alloc6, alloc13, alloc14) R.vm.kill_object(alloc6) R.vm.kill_object(alloc13) model_decoder_layers_0_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[505] model_decoder_layers_0_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[506] gv30: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc15: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv30, R.dtype("float16")) cls.layer_norm2(alloc14, model_decoder_layers_0_encoder_attn_layer_norm_weight2, model_decoder_layers_0_encoder_attn_layer_norm_bias2, alloc15) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias2) model_decoder_layers_0_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[501] model_decoder_layers_0_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[502] gv31: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc16: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv31, R.dtype("float16")) _14: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_q_proj_weight2, alloc15, model_decoder_layers_0_encoder_attn_q_proj_bias2, alloc16) R.vm.kill_object(alloc15) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias2) gv32: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape393: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc16, gv32, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc16) gv33: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape394: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape393, gv33, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape393) gv34: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc17: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv34, R.dtype("float16")) _15: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape394, alloc17) R.vm.kill_object(reshape394) gv35: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape395: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc17, gv35, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc17) gv36: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) reshape396: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape395, gv36, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape395) model_decoder_layers_0_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[503] model_decoder_layers_0_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[504] gv37: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc18: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv37, R.dtype("float16")) _16: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_out_proj_weight2, reshape396, model_decoder_layers_0_encoder_attn_out_proj_bias2, alloc18) R.vm.kill_object(reshape396) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias2) gv38: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc19: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv38, R.dtype("float16")) cls.add5(alloc14, alloc18, alloc19) R.vm.kill_object(alloc14) R.vm.kill_object(alloc18) model_decoder_layers_0_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[511] model_decoder_layers_0_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[512] gv39: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc20: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv39, R.dtype("float16")) cls.layer_norm2(alloc19, model_decoder_layers_0_final_layer_norm_weight2, model_decoder_layers_0_final_layer_norm_bias2, alloc20) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias2) model_decoder_layers_0_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] model_decoder_layers_0_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[508] gv40: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc21: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv40, R.dtype("float16")) _19: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_0_fc1_weight2, alloc20, model_decoder_layers_0_fc1_bias2, alloc21) R.vm.kill_object(alloc20) R.vm.kill_object(model_decoder_layers_0_fc1_weight2) R.vm.kill_object(model_decoder_layers_0_fc1_bias2) model_decoder_layers_0_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] model_decoder_layers_0_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[510] gv41: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc22: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv41, R.dtype("float16")) _20: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_0_fc2_weight2, alloc21, model_decoder_layers_0_fc2_bias2, 
alloc22) R.vm.kill_object(alloc21) R.vm.kill_object(model_decoder_layers_0_fc2_weight2) R.vm.kill_object(model_decoder_layers_0_fc2_bias2) gv42: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc23: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv42, R.dtype("float16")) cls.add5(alloc19, alloc22, alloc23) R.vm.kill_object(alloc19) R.vm.kill_object(alloc22) model_decoder_layers_1_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[520] model_decoder_layers_1_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[521] gv43: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc24: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv43, R.dtype("float16")) cls.layer_norm2(alloc23, model_decoder_layers_1_self_attn_layer_norm_weight2, model_decoder_layers_1_self_attn_layer_norm_bias2, alloc24) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias2) model_decoder_layers_1_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] model_decoder_layers_1_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[517] gv44: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc25: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv44, R.dtype("float16")) _23: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_q_proj_weight2, alloc24, model_decoder_layers_1_self_attn_q_proj_bias2, alloc25) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias2) gv45: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape397: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc25, gv45, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc25) model_decoder_layers_1_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] gv46: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc26: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv46, R.dtype("float16")) _24: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_1_self_attn_k_proj_weight2, alloc24, alloc26) R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight2) gv47: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape398: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc26, gv47, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc26) model_decoder_layers_1_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[514] model_decoder_layers_1_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[515] gv48: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc27: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv48, R.dtype("float16")) _25: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_v_proj_weight2, alloc24, model_decoder_layers_1_self_attn_v_proj_bias2, alloc27) R.vm.kill_object(alloc24) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias2) gv49: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape399: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc27, gv49, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc27) gv50: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc28: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv50, R.dtype("float16")) cls.concatenate1(reshape397, reshape398, reshape399, alloc28) R.vm.kill_object(reshape397) R.vm.kill_object(reshape398) R.vm.kill_object(reshape399) gv51: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape400: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc28, gv51, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc28) gv52: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc29: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv52, R.dtype("float16")) _27: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape400, alloc29) R.vm.kill_object(reshape400) gv53: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape401: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc29, gv53, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc29) gv54: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape402: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape401, gv54, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape401) model_decoder_layers_1_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] model_decoder_layers_1_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[519] gv55: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc30: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv55, R.dtype("float16")) _28: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_out_proj_weight2, reshape402, model_decoder_layers_1_self_attn_out_proj_bias2, alloc30) R.vm.kill_object(reshape402) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias2) gv56: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc31: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv56, R.dtype("float16")) cls.add5(alloc23, alloc30, alloc31) R.vm.kill_object(alloc23) R.vm.kill_object(alloc30) model_decoder_layers_1_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[529] model_decoder_layers_1_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[530] gv57: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc32: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv57, R.dtype("float16")) cls.layer_norm2(alloc31, model_decoder_layers_1_encoder_attn_layer_norm_weight2, model_decoder_layers_1_encoder_attn_layer_norm_bias2, alloc32) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias2) model_decoder_layers_1_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[525] model_decoder_layers_1_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[526] gv58: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc33: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv58, R.dtype("float16")) _31: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_q_proj_weight2, alloc32, model_decoder_layers_1_encoder_attn_q_proj_bias2, alloc33) R.vm.kill_object(alloc32) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias2) gv59: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape403: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc33, gv59, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc33) gv60: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape404: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape403, gv60, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape403) gv61: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc34: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv61, R.dtype("float16")) _32: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape404, alloc34) R.vm.kill_object(reshape404) gv62: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape405: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc34, gv62, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc34) gv63: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape406: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape405, gv63, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape405) model_decoder_layers_1_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] model_decoder_layers_1_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[528] gv64: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc35: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv64, R.dtype("float16")) _33: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_out_proj_weight2, reshape406, model_decoder_layers_1_encoder_attn_out_proj_bias2, alloc35) 
R.vm.kill_object(reshape406) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias2) gv65: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc36: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv65, R.dtype("float16")) cls.add5(alloc31, alloc35, alloc36) R.vm.kill_object(alloc31) R.vm.kill_object(alloc35) model_decoder_layers_1_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[535] model_decoder_layers_1_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[536] gv66: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc37: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv66, R.dtype("float16")) cls.layer_norm2(alloc36, model_decoder_layers_1_final_layer_norm_weight2, model_decoder_layers_1_final_layer_norm_bias2, alloc37) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias2) model_decoder_layers_1_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] model_decoder_layers_1_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[532] gv67: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc38: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv67, R.dtype("float16")) _36: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_1_fc1_weight2, alloc37, model_decoder_layers_1_fc1_bias2, alloc38) R.vm.kill_object(alloc37) R.vm.kill_object(model_decoder_layers_1_fc1_weight2) R.vm.kill_object(model_decoder_layers_1_fc1_bias2) model_decoder_layers_1_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] model_decoder_layers_1_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[534] gv68: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc39: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv68, R.dtype("float16")) _37: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_1_fc2_weight2, alloc38, model_decoder_layers_1_fc2_bias2, alloc39) R.vm.kill_object(alloc38) R.vm.kill_object(model_decoder_layers_1_fc2_weight2) R.vm.kill_object(model_decoder_layers_1_fc2_bias2) gv69: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc40: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv69, R.dtype("float16")) cls.add5(alloc36, alloc39, alloc40) R.vm.kill_object(alloc36) R.vm.kill_object(alloc39) model_decoder_layers_2_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[544] model_decoder_layers_2_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[545] gv70: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc41: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv70, R.dtype("float16")) cls.layer_norm2(alloc40, model_decoder_layers_2_self_attn_layer_norm_weight2, model_decoder_layers_2_self_attn_layer_norm_bias2, alloc41) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias2) model_decoder_layers_2_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] model_decoder_layers_2_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[541] gv71: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc42: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv71, R.dtype("float16")) _40: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_q_proj_weight2, alloc41, model_decoder_layers_2_self_attn_q_proj_bias2, alloc42) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias2) gv72: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape407: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc42, gv72, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc42) model_decoder_layers_2_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] gv73: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc43: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv73, R.dtype("float16")) _41: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_2_self_attn_k_proj_weight2, alloc41, alloc43) R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight2) gv74: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape408: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc43, gv74, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc43) model_decoder_layers_2_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[538] model_decoder_layers_2_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[539] gv75: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc44: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv75, R.dtype("float16")) _42: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_v_proj_weight2, alloc41, model_decoder_layers_2_self_attn_v_proj_bias2, alloc44) R.vm.kill_object(alloc41) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias2) gv76: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape409: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc44, gv76, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc44) gv77: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc45: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv77, R.dtype("float16")) cls.concatenate1(reshape407, reshape408, reshape409, alloc45) R.vm.kill_object(reshape407) R.vm.kill_object(reshape408) R.vm.kill_object(reshape409) gv78: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape410: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc45, gv78, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc45) gv79: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc46: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv79, R.dtype("float16")) _44: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape410, alloc46) R.vm.kill_object(reshape410) gv80: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape411: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc46, gv80, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc46) gv81: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape412: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape411, gv81, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape411) model_decoder_layers_2_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] model_decoder_layers_2_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[543] gv82: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc47: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv82, R.dtype("float16")) _45: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_out_proj_weight2, reshape412, model_decoder_layers_2_self_attn_out_proj_bias2, alloc47) R.vm.kill_object(reshape412) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias2) gv83: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc48: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv83, R.dtype("float16")) cls.add5(alloc40, 
alloc47, alloc48) R.vm.kill_object(alloc40) R.vm.kill_object(alloc47) model_decoder_layers_2_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[553] model_decoder_layers_2_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[554] gv84: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc49: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv84, R.dtype("float16")) cls.layer_norm2(alloc48, model_decoder_layers_2_encoder_attn_layer_norm_weight2, model_decoder_layers_2_encoder_attn_layer_norm_bias2, alloc49) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias2) model_decoder_layers_2_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] model_decoder_layers_2_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[550] gv85: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc50: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv85, R.dtype("float16")) _48: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_q_proj_weight2, alloc49, model_decoder_layers_2_encoder_attn_q_proj_bias2, alloc50) R.vm.kill_object(alloc49) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias2) gv86: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape413: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc50, gv86, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc50) gv87: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape414: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape413, gv87, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape413) gv88: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc51: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv88, R.dtype("float16")) _49: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape414, alloc51) R.vm.kill_object(reshape414) gv89: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape415: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc51, gv89, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc51) gv90: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
reshape416: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape415, gv90, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape415) model_decoder_layers_2_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] model_decoder_layers_2_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[552] gv91: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc52: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv91, R.dtype("float16")) _50: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_out_proj_weight2, reshape416, model_decoder_layers_2_encoder_attn_out_proj_bias2, alloc52) R.vm.kill_object(reshape416) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias2) gv92: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc53: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv92, R.dtype("float16")) cls.add5(alloc48, alloc52, alloc53) R.vm.kill_object(alloc48) R.vm.kill_object(alloc52) model_decoder_layers_2_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[559] model_decoder_layers_2_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[560] gv93: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc54: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv93, R.dtype("float16")) cls.layer_norm2(alloc53, model_decoder_layers_2_final_layer_norm_weight2, model_decoder_layers_2_final_layer_norm_bias2, alloc54) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias2) model_decoder_layers_2_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] model_decoder_layers_2_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[556] gv94: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc55: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv94, R.dtype("float16")) _53: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_2_fc1_weight2, alloc54, model_decoder_layers_2_fc1_bias2, alloc55) R.vm.kill_object(alloc54) R.vm.kill_object(model_decoder_layers_2_fc1_weight2) R.vm.kill_object(model_decoder_layers_2_fc1_bias2) model_decoder_layers_2_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] model_decoder_layers_2_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[558] gv95: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc56: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv95, R.dtype("float16")) _54: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_2_fc2_weight2, alloc55, model_decoder_layers_2_fc2_bias2, alloc56) R.vm.kill_object(alloc55) 
R.vm.kill_object(model_decoder_layers_2_fc2_weight2) R.vm.kill_object(model_decoder_layers_2_fc2_bias2) gv96: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc57: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv96, R.dtype("float16")) cls.add5(alloc53, alloc56, alloc57) R.vm.kill_object(alloc53) R.vm.kill_object(alloc56) model_decoder_layers_3_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[568] model_decoder_layers_3_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[569] gv97: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc58: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv97, R.dtype("float16")) cls.layer_norm2(alloc57, model_decoder_layers_3_self_attn_layer_norm_weight2, model_decoder_layers_3_self_attn_layer_norm_bias2, alloc58) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias2) model_decoder_layers_3_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] model_decoder_layers_3_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[565] gv98: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc59: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv98, R.dtype("float16")) _57: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_q_proj_weight2, alloc58, model_decoder_layers_3_self_attn_q_proj_bias2, alloc59) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias2) gv99: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape417: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc59, gv99, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc59) model_decoder_layers_3_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] gv100: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc60: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv100, R.dtype("float16")) _58: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_3_self_attn_k_proj_weight2, alloc58, alloc60) R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight2) gv101: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape418: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc60, gv101, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc60) model_decoder_layers_3_self_attn_v_proj_weight2: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[562] model_decoder_layers_3_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[563] gv102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc61: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv102, R.dtype("float16")) _59: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_v_proj_weight2, alloc58, model_decoder_layers_3_self_attn_v_proj_bias2, alloc61) R.vm.kill_object(alloc58) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias2) gv103: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape419: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc61, gv103, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc61) gv104: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc62: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv104, R.dtype("float16")) cls.concatenate1(reshape417, reshape418, reshape419, alloc62) R.vm.kill_object(reshape417) R.vm.kill_object(reshape418) R.vm.kill_object(reshape419) gv105: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape420: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc62, gv105, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc62) gv106: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc63: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv106, R.dtype("float16")) _61: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape420, alloc63) R.vm.kill_object(reshape420) gv107: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape421: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc63, gv107, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc63) gv108: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape422: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape421, gv108, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape421) model_decoder_layers_3_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] model_decoder_layers_3_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[567] gv109: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc64: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv109, R.dtype("float16")) _62: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_out_proj_weight2, reshape422, model_decoder_layers_3_self_attn_out_proj_bias2, alloc64) R.vm.kill_object(reshape422) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias2) gv110: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc65: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv110, R.dtype("float16")) cls.add5(alloc57, alloc64, alloc65) R.vm.kill_object(alloc57) R.vm.kill_object(alloc64) model_decoder_layers_3_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[577] model_decoder_layers_3_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[578] gv111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc66: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv111, R.dtype("float16")) cls.layer_norm2(alloc65, model_decoder_layers_3_encoder_attn_layer_norm_weight2, model_decoder_layers_3_encoder_attn_layer_norm_bias2, alloc66) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias2) 
model_decoder_layers_3_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] model_decoder_layers_3_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[574] gv112: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc67: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv112, R.dtype("float16")) _65: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_q_proj_weight2, alloc66, model_decoder_layers_3_encoder_attn_q_proj_bias2, alloc67) R.vm.kill_object(alloc66) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias2) gv113: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape423: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc67, gv113, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc67) gv114: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape424: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape423, gv114, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape423) gv115: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc68: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv115, R.dtype("float16")) _66: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape424, alloc68) R.vm.kill_object(reshape424) gv116: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape425: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc68, gv116, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc68) gv117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape426: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape425, gv117, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape425) model_decoder_layers_3_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] model_decoder_layers_3_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[576] gv118: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc69: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv118, R.dtype("float16")) _67: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_out_proj_weight2, reshape426, 
model_decoder_layers_3_encoder_attn_out_proj_bias2, alloc69) R.vm.kill_object(reshape426) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias2) gv119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc70: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv119, R.dtype("float16")) cls.add5(alloc65, alloc69, alloc70) R.vm.kill_object(alloc65) R.vm.kill_object(alloc69) model_decoder_layers_3_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[583] model_decoder_layers_3_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[584] gv120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc71: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv120, R.dtype("float16")) cls.layer_norm2(alloc70, model_decoder_layers_3_final_layer_norm_weight2, model_decoder_layers_3_final_layer_norm_bias2, alloc71) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias2) model_decoder_layers_3_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] model_decoder_layers_3_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[580] gv121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc72: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv121, 
R.dtype("float16")) _70: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_3_fc1_weight2, alloc71, model_decoder_layers_3_fc1_bias2, alloc72) R.vm.kill_object(alloc71) R.vm.kill_object(model_decoder_layers_3_fc1_weight2) R.vm.kill_object(model_decoder_layers_3_fc1_bias2) model_decoder_layers_3_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] model_decoder_layers_3_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[582] gv122: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc73: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv122, R.dtype("float16")) _71: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_3_fc2_weight2, alloc72, model_decoder_layers_3_fc2_bias2, alloc73) R.vm.kill_object(alloc72) R.vm.kill_object(model_decoder_layers_3_fc2_weight2) R.vm.kill_object(model_decoder_layers_3_fc2_bias2) gv123: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc74: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv123, R.dtype("float16")) cls.add5(alloc70, alloc73, alloc74) R.vm.kill_object(alloc70) R.vm.kill_object(alloc73) model_decoder_layers_4_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[592] model_decoder_layers_4_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[593] gv124: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc75: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv124, R.dtype("float16")) cls.layer_norm2(alloc74, model_decoder_layers_4_self_attn_layer_norm_weight2, model_decoder_layers_4_self_attn_layer_norm_bias2, alloc75) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias2) model_decoder_layers_4_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] model_decoder_layers_4_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[589] gv125: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc76: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv125, R.dtype("float16")) _74: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_q_proj_weight2, alloc75, model_decoder_layers_4_self_attn_q_proj_bias2, alloc76) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias2) gv126: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape427: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc76, gv126, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc76) model_decoder_layers_4_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] gv127: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc77: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv127, R.dtype("float16")) _75: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_4_self_attn_k_proj_weight2, alloc75, alloc77) R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight2) gv128: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape428: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc77, gv128, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc77) model_decoder_layers_4_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] model_decoder_layers_4_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[587] gv129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc78: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv129, R.dtype("float16")) _76: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_v_proj_weight2, alloc75, model_decoder_layers_4_self_attn_v_proj_bias2, alloc78) R.vm.kill_object(alloc75) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias2) gv130: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape429: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc78, gv130, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc78) gv131: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc79: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv131, R.dtype("float16")) cls.concatenate1(reshape427, reshape428, reshape429, alloc79) R.vm.kill_object(reshape427) R.vm.kill_object(reshape428) R.vm.kill_object(reshape429) gv132: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape430: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc79, gv132, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc79) gv133: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc80: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv133, R.dtype("float16")) _78: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape430, alloc80) R.vm.kill_object(reshape430) gv134: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape431: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc80, gv134, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc80) gv135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape432: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape431, gv135, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape431) model_decoder_layers_4_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] model_decoder_layers_4_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[591] gv136: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc81: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv136, R.dtype("float16")) _79: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_out_proj_weight2, reshape432, model_decoder_layers_4_self_attn_out_proj_bias2, alloc81) R.vm.kill_object(reshape432) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias2) gv137: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc82: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, 
R.prim_value(0), gv137, R.dtype("float16")) cls.add5(alloc74, alloc81, alloc82) R.vm.kill_object(alloc74) R.vm.kill_object(alloc81) model_decoder_layers_4_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[601] model_decoder_layers_4_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[602] gv138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc83: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv138, R.dtype("float16")) cls.layer_norm2(alloc82, model_decoder_layers_4_encoder_attn_layer_norm_weight2, model_decoder_layers_4_encoder_attn_layer_norm_bias2, alloc83) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias2) model_decoder_layers_4_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] model_decoder_layers_4_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[598] gv139: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc84: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv139, R.dtype("float16")) _82: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_q_proj_weight2, alloc83, model_decoder_layers_4_encoder_attn_q_proj_bias2, alloc84) R.vm.kill_object(alloc83) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias2) gv140: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape433: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc84, gv140, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc84) gv141: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape434: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape433, gv141, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape433) gv142: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc85: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv142, R.dtype("float16")) _83: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape434, alloc85) R.vm.kill_object(reshape434) gv143: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape435: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc85, gv143, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc85) gv144: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape436: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape435, gv144, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape435) model_decoder_layers_4_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] model_decoder_layers_4_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[600] gv145: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc86: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv145, R.dtype("float16")) _84: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_out_proj_weight2, reshape436, model_decoder_layers_4_encoder_attn_out_proj_bias2, alloc86) R.vm.kill_object(reshape436) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias2) gv146: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc87: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv146, R.dtype("float16")) cls.add5(alloc82, alloc86, alloc87) R.vm.kill_object(alloc82) R.vm.kill_object(alloc86) model_decoder_layers_4_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[607] model_decoder_layers_4_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[608] gv147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc88: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv147, R.dtype("float16")) cls.layer_norm2(alloc87, model_decoder_layers_4_final_layer_norm_weight2, model_decoder_layers_4_final_layer_norm_bias2, alloc88) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias2) model_decoder_layers_4_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] model_decoder_layers_4_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[604] gv148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc89: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv148, R.dtype("float16")) _87: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_4_fc1_weight2, alloc88, model_decoder_layers_4_fc1_bias2, alloc89) R.vm.kill_object(alloc88) R.vm.kill_object(model_decoder_layers_4_fc1_weight2) R.vm.kill_object(model_decoder_layers_4_fc1_bias2) model_decoder_layers_4_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] model_decoder_layers_4_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[606] gv149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc90: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv149, R.dtype("float16")) _88: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_4_fc2_weight2, alloc89, 
model_decoder_layers_4_fc2_bias2, alloc90) R.vm.kill_object(alloc89) R.vm.kill_object(model_decoder_layers_4_fc2_weight2) R.vm.kill_object(model_decoder_layers_4_fc2_bias2) gv150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc91: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv150, R.dtype("float16")) cls.add5(alloc87, alloc90, alloc91) R.vm.kill_object(alloc87) R.vm.kill_object(alloc90) model_decoder_layers_5_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[616] model_decoder_layers_5_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[617] gv151: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc92: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv151, R.dtype("float16")) cls.layer_norm2(alloc91, model_decoder_layers_5_self_attn_layer_norm_weight2, model_decoder_layers_5_self_attn_layer_norm_bias2, alloc92) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias2) model_decoder_layers_5_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] model_decoder_layers_5_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[613] gv152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc93: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv152, 
R.dtype("float16")) _91: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_q_proj_weight2, alloc92, model_decoder_layers_5_self_attn_q_proj_bias2, alloc93) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias2) gv153: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape437: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc93, gv153, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc93) model_decoder_layers_5_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] gv154: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc94: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv154, R.dtype("float16")) _92: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_5_self_attn_k_proj_weight2, alloc92, alloc94) R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight2) gv155: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape438: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc94, gv155, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc94) model_decoder_layers_5_self_attn_v_proj_weight2: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[610] model_decoder_layers_5_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[611] gv156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc95: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv156, R.dtype("float16")) _93: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_v_proj_weight2, alloc92, model_decoder_layers_5_self_attn_v_proj_bias2, alloc95) R.vm.kill_object(alloc92) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias2) gv157: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape439: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc95, gv157, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc95) gv158: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc96: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv158, R.dtype("float16")) cls.concatenate1(reshape437, reshape438, reshape439, alloc96) R.vm.kill_object(reshape437) R.vm.kill_object(reshape438) R.vm.kill_object(reshape439) gv159: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape440: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc96, gv159, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc96) gv160: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc97: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv160, R.dtype("float16")) _95: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape440, alloc97) R.vm.kill_object(reshape440) gv161: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape441: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc97, gv161, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc97) gv162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape442: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape441, gv162, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape441) model_decoder_layers_5_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] model_decoder_layers_5_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[615] gv163: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc98: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv163, R.dtype("float16")) _96: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_out_proj_weight2, reshape442, model_decoder_layers_5_self_attn_out_proj_bias2, alloc98) R.vm.kill_object(reshape442) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias2) gv164: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc99: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv164, R.dtype("float16")) cls.add5(alloc91, alloc98, alloc99) R.vm.kill_object(alloc91) R.vm.kill_object(alloc98) model_decoder_layers_5_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[625] model_decoder_layers_5_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[626] gv165: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc100: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv165, R.dtype("float16")) cls.layer_norm2(alloc99, model_decoder_layers_5_encoder_attn_layer_norm_weight2, model_decoder_layers_5_encoder_attn_layer_norm_bias2, alloc100) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias2) 
model_decoder_layers_5_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] model_decoder_layers_5_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[622] gv166: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc101: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv166, R.dtype("float16")) _99: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_q_proj_weight2, alloc100, model_decoder_layers_5_encoder_attn_q_proj_bias2, alloc101) R.vm.kill_object(alloc100) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias2) gv167: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape443: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc101, gv167, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc101) gv168: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape444: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape443, gv168, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape443) gv169: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc102: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv169, R.dtype("float16")) _100: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape444, alloc102) R.vm.kill_object(reshape444) gv170: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape445: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc102, gv170, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc102) gv171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape446: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape445, gv171, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape445) model_decoder_layers_5_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] model_decoder_layers_5_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[624] gv172: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc103: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv172, R.dtype("float16")) _101: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_out_proj_weight2, 
reshape446, model_decoder_layers_5_encoder_attn_out_proj_bias2, alloc103) R.vm.kill_object(reshape446) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias2) gv173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc104: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv173, R.dtype("float16")) cls.add5(alloc99, alloc103, alloc104) R.vm.kill_object(alloc99) R.vm.kill_object(alloc103) model_decoder_layers_5_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[631] model_decoder_layers_5_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[632] gv174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc105: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv174, R.dtype("float16")) cls.layer_norm2(alloc104, model_decoder_layers_5_final_layer_norm_weight2, model_decoder_layers_5_final_layer_norm_bias2, alloc105) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias2) model_decoder_layers_5_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] model_decoder_layers_5_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[628] gv175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc106: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, 
R.prim_value(0), gv175, R.dtype("float16")) _104: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_5_fc1_weight2, alloc105, model_decoder_layers_5_fc1_bias2, alloc106) R.vm.kill_object(alloc105) R.vm.kill_object(model_decoder_layers_5_fc1_weight2) R.vm.kill_object(model_decoder_layers_5_fc1_bias2) model_decoder_layers_5_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] model_decoder_layers_5_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[630] gv176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc107: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv176, R.dtype("float16")) _105: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_5_fc2_weight2, alloc106, model_decoder_layers_5_fc2_bias2, alloc107) R.vm.kill_object(alloc106) R.vm.kill_object(model_decoder_layers_5_fc2_weight2) R.vm.kill_object(model_decoder_layers_5_fc2_bias2) gv177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc108: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv177, R.dtype("float16")) cls.add5(alloc104, alloc107, alloc108) R.vm.kill_object(alloc104) R.vm.kill_object(alloc107) model_decoder_layers_6_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[640] model_decoder_layers_6_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[641] gv178: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc109: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv178, R.dtype("float16")) cls.layer_norm2(alloc108, model_decoder_layers_6_self_attn_layer_norm_weight2, model_decoder_layers_6_self_attn_layer_norm_bias2, alloc109) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias2) model_decoder_layers_6_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] model_decoder_layers_6_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[637] gv179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc110: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv179, R.dtype("float16")) _108: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_q_proj_weight2, alloc109, model_decoder_layers_6_self_attn_q_proj_bias2, alloc110) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias2) gv180: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape447: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc110, gv180, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc110) model_decoder_layers_6_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] gv181: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc111: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv181, R.dtype("float16")) _109: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_6_self_attn_k_proj_weight2, alloc109, alloc111) R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight2) gv182: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape448: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc111, gv182, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc111) model_decoder_layers_6_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] model_decoder_layers_6_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[635] gv183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc112: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv183, R.dtype("float16")) _110: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_v_proj_weight2, alloc109, model_decoder_layers_6_self_attn_v_proj_bias2, alloc112) R.vm.kill_object(alloc109) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias2) gv184: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape449: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc112, gv184, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc112) gv185: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc113: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv185, R.dtype("float16")) cls.concatenate1(reshape447, reshape448, reshape449, alloc113) R.vm.kill_object(reshape447) R.vm.kill_object(reshape448) R.vm.kill_object(reshape449) gv186: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape450: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc113, gv186, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc113) gv187: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc114: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv187, R.dtype("float16")) _112: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape450, alloc114) R.vm.kill_object(reshape450) gv188: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape451: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc114, gv188, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc114) gv189: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape452: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape451, gv189, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape451) model_decoder_layers_6_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[638] model_decoder_layers_6_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[639] gv190: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc115: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv190, R.dtype("float16")) _113: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_out_proj_weight2, reshape452, model_decoder_layers_6_self_attn_out_proj_bias2, alloc115) R.vm.kill_object(reshape452) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias2) gv191: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc116: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv191, R.dtype("float16")) cls.add5(alloc108, alloc115, alloc116) R.vm.kill_object(alloc108) R.vm.kill_object(alloc115) model_decoder_layers_6_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[649] model_decoder_layers_6_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[650] gv192: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc117: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv192, R.dtype("float16")) cls.layer_norm2(alloc116, model_decoder_layers_6_encoder_attn_layer_norm_weight2, model_decoder_layers_6_encoder_attn_layer_norm_bias2, alloc117) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias2) model_decoder_layers_6_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] model_decoder_layers_6_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[646] gv193: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc118: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv193, R.dtype("float16")) _116: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_q_proj_weight2, alloc117, model_decoder_layers_6_encoder_attn_q_proj_bias2, alloc118) R.vm.kill_object(alloc117) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight2) 
R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias2) gv194: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape453: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc118, gv194, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc118) gv195: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape454: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape453, gv195, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape453) gv196: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc119: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv196, R.dtype("float16")) _117: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape454, alloc119) R.vm.kill_object(reshape454) gv197: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape455: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc119, gv197, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc119) gv198: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape456: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape455, gv198, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape455) model_decoder_layers_6_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] model_decoder_layers_6_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[648] gv199: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc120: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv199, R.dtype("float16")) _118: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_out_proj_weight2, reshape456, model_decoder_layers_6_encoder_attn_out_proj_bias2, alloc120) R.vm.kill_object(reshape456) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias2) gv200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc121: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv200, R.dtype("float16")) cls.add5(alloc116, alloc120, alloc121) R.vm.kill_object(alloc116) R.vm.kill_object(alloc120) model_decoder_layers_6_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[655] model_decoder_layers_6_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = 
packed_params[656] gv201: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc122: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv201, R.dtype("float16")) cls.layer_norm2(alloc121, model_decoder_layers_6_final_layer_norm_weight2, model_decoder_layers_6_final_layer_norm_bias2, alloc122) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias2) model_decoder_layers_6_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] model_decoder_layers_6_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[652] gv202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc123: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv202, R.dtype("float16")) _121: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_6_fc1_weight2, alloc122, model_decoder_layers_6_fc1_bias2, alloc123) R.vm.kill_object(alloc122) R.vm.kill_object(model_decoder_layers_6_fc1_weight2) R.vm.kill_object(model_decoder_layers_6_fc1_bias2) model_decoder_layers_6_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[653] model_decoder_layers_6_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[654] gv203: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc124: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv203, 
R.dtype("float16")) _122: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_6_fc2_weight2, alloc123, model_decoder_layers_6_fc2_bias2, alloc124) R.vm.kill_object(alloc123) R.vm.kill_object(model_decoder_layers_6_fc2_weight2) R.vm.kill_object(model_decoder_layers_6_fc2_bias2) gv204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc125: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv204, R.dtype("float16")) cls.add5(alloc121, alloc124, alloc125) R.vm.kill_object(alloc121) R.vm.kill_object(alloc124) model_decoder_layers_7_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[664] model_decoder_layers_7_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[665] gv205: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc126: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv205, R.dtype("float16")) cls.layer_norm2(alloc125, model_decoder_layers_7_self_attn_layer_norm_weight2, model_decoder_layers_7_self_attn_layer_norm_bias2, alloc126) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias2) model_decoder_layers_7_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[660] model_decoder_layers_7_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[661] gv206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc127: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv206, R.dtype("float16")) _125: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_q_proj_weight2, alloc126, model_decoder_layers_7_self_attn_q_proj_bias2, alloc127) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias2) gv207: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape457: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc127, gv207, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc127) model_decoder_layers_7_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] gv208: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc128: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv208, R.dtype("float16")) _126: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_7_self_attn_k_proj_weight2, alloc126, alloc128) R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight2) gv209: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape458: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc128, gv209, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc128) model_decoder_layers_7_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] model_decoder_layers_7_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[659] gv210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc129: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv210, R.dtype("float16")) _127: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_v_proj_weight2, alloc126, model_decoder_layers_7_self_attn_v_proj_bias2, alloc129) R.vm.kill_object(alloc126) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias2) gv211: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape459: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc129, gv211, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc129) gv212: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc130: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv212, R.dtype("float16")) cls.concatenate1(reshape457, reshape458, reshape459, alloc130) 
R.vm.kill_object(reshape457) R.vm.kill_object(reshape458) R.vm.kill_object(reshape459) gv213: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape460: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc130, gv213, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc130) gv214: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc131: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv214, R.dtype("float16")) _129: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape460, alloc131) R.vm.kill_object(reshape460) gv215: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape461: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc131, gv215, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc131) gv216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape462: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape461, gv216, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape461) 
model_decoder_layers_7_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] model_decoder_layers_7_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[663] gv217: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc132: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv217, R.dtype("float16")) _130: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_out_proj_weight2, reshape462, model_decoder_layers_7_self_attn_out_proj_bias2, alloc132) R.vm.kill_object(reshape462) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias2) gv218: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc133: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv218, R.dtype("float16")) cls.add5(alloc125, alloc132, alloc133) R.vm.kill_object(alloc125) R.vm.kill_object(alloc132) model_decoder_layers_7_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[673] model_decoder_layers_7_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[674] gv219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc134: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv219, R.dtype("float16")) cls.layer_norm2(alloc133, 
model_decoder_layers_7_encoder_attn_layer_norm_weight2, model_decoder_layers_7_encoder_attn_layer_norm_bias2, alloc134) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias2) model_decoder_layers_7_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] model_decoder_layers_7_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[670] gv220: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc135: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv220, R.dtype("float16")) _133: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_q_proj_weight2, alloc134, model_decoder_layers_7_encoder_attn_q_proj_bias2, alloc135) R.vm.kill_object(alloc134) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias2) gv221: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape463: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc135, gv221, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc135) gv222: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape464: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape463, gv222, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape463) gv223: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc136: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv223, R.dtype("float16")) _134: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape464, alloc136) R.vm.kill_object(reshape464) gv224: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape465: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc136, gv224, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc136) gv225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape466: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape465, gv225, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape465) model_decoder_layers_7_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] model_decoder_layers_7_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[672] gv226: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc137: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv226, R.dtype("float16")) _135: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_out_proj_weight2, reshape466, model_decoder_layers_7_encoder_attn_out_proj_bias2, alloc137) R.vm.kill_object(reshape466) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias2) gv227: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc138: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv227, R.dtype("float16")) cls.add5(alloc133, alloc137, alloc138) R.vm.kill_object(alloc133) R.vm.kill_object(alloc137) model_decoder_layers_7_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[679] model_decoder_layers_7_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[680] gv228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc139: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv228, R.dtype("float16")) cls.layer_norm2(alloc138, model_decoder_layers_7_final_layer_norm_weight2, model_decoder_layers_7_final_layer_norm_bias2, alloc139) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias2) model_decoder_layers_7_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] model_decoder_layers_7_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[676] gv229: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc140: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv229, R.dtype("float16")) _138: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_7_fc1_weight2, alloc139, model_decoder_layers_7_fc1_bias2, alloc140) R.vm.kill_object(alloc139) R.vm.kill_object(model_decoder_layers_7_fc1_weight2) R.vm.kill_object(model_decoder_layers_7_fc1_bias2) model_decoder_layers_7_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] model_decoder_layers_7_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[678] gv230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc141: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv230, R.dtype("float16")) _139: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_7_fc2_weight2, alloc140, model_decoder_layers_7_fc2_bias2, alloc141) R.vm.kill_object(alloc140) R.vm.kill_object(model_decoder_layers_7_fc2_weight2) R.vm.kill_object(model_decoder_layers_7_fc2_bias2) gv231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc142: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv231, R.dtype("float16")) cls.add5(alloc138, alloc141, alloc142) R.vm.kill_object(alloc138) R.vm.kill_object(alloc141) model_decoder_layers_8_self_attn_layer_norm_weight2: 
R.Tensor((1280,), dtype="float16") = packed_params[688] model_decoder_layers_8_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[689] gv232: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc143: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv232, R.dtype("float16")) cls.layer_norm2(alloc142, model_decoder_layers_8_self_attn_layer_norm_weight2, model_decoder_layers_8_self_attn_layer_norm_bias2, alloc143) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias2) model_decoder_layers_8_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] model_decoder_layers_8_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[685] gv233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc144: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv233, R.dtype("float16")) _142: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_q_proj_weight2, alloc143, model_decoder_layers_8_self_attn_q_proj_bias2, alloc144) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias2) gv234: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape467: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc144, gv234, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc144) model_decoder_layers_8_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] gv235: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc145: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv235, R.dtype("float16")) _143: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_8_self_attn_k_proj_weight2, alloc143, alloc145) R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight2) gv236: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape468: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc145, gv236, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc145) model_decoder_layers_8_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[682] model_decoder_layers_8_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[683] gv237: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc146: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv237, R.dtype("float16")) _144: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_v_proj_weight2, 
alloc143, model_decoder_layers_8_self_attn_v_proj_bias2, alloc146) R.vm.kill_object(alloc143) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias2) gv238: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape469: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc146, gv238, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc146) gv239: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc147: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv239, R.dtype("float16")) cls.concatenate1(reshape467, reshape468, reshape469, alloc147) R.vm.kill_object(reshape467) R.vm.kill_object(reshape468) R.vm.kill_object(reshape469) gv240: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape470: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc147, gv240, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc147) gv241: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc148: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv241, R.dtype("float16")) _146: 
R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape470, alloc148) R.vm.kill_object(reshape470) gv242: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape471: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc148, gv242, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc148) gv243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape472: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape471, gv243, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape471) model_decoder_layers_8_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] model_decoder_layers_8_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[687] gv244: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc149: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv244, R.dtype("float16")) _147: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_out_proj_weight2, reshape472, model_decoder_layers_8_self_attn_out_proj_bias2, alloc149) R.vm.kill_object(reshape472) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight2) 
R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias2) gv245: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc150: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv245, R.dtype("float16")) cls.add5(alloc142, alloc149, alloc150) R.vm.kill_object(alloc142) R.vm.kill_object(alloc149) model_decoder_layers_8_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[697] model_decoder_layers_8_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[698] gv246: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc151: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv246, R.dtype("float16")) cls.layer_norm2(alloc150, model_decoder_layers_8_encoder_attn_layer_norm_weight2, model_decoder_layers_8_encoder_attn_layer_norm_bias2, alloc151) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias2) model_decoder_layers_8_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] model_decoder_layers_8_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[694] gv247: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc152: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv247, R.dtype("float16")) _150: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_q_proj_weight2, alloc151, model_decoder_layers_8_encoder_attn_q_proj_bias2, alloc152) R.vm.kill_object(alloc151) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias2) gv248: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape473: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc152, gv248, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc152) gv249: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape474: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape473, gv249, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape473) gv250: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc153: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv250, R.dtype("float16")) _151: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape474, alloc153) R.vm.kill_object(reshape474) gv251: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape475: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc153, gv251, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc153) gv252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape476: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape475, gv252, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape475) model_decoder_layers_8_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] model_decoder_layers_8_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[696] gv253: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc154: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv253, R.dtype("float16")) _152: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_out_proj_weight2, reshape476, model_decoder_layers_8_encoder_attn_out_proj_bias2, alloc154) R.vm.kill_object(reshape476) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias2) gv254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc155: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv254, 
R.dtype("float16")) cls.add5(alloc150, alloc154, alloc155) R.vm.kill_object(alloc150) R.vm.kill_object(alloc154) model_decoder_layers_8_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[703] model_decoder_layers_8_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[704] gv255: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc156: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv255, R.dtype("float16")) cls.layer_norm2(alloc155, model_decoder_layers_8_final_layer_norm_weight2, model_decoder_layers_8_final_layer_norm_bias2, alloc156) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias2) model_decoder_layers_8_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] model_decoder_layers_8_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[700] gv256: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc157: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv256, R.dtype("float16")) _155: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_8_fc1_weight2, alloc156, model_decoder_layers_8_fc1_bias2, alloc157) R.vm.kill_object(alloc156) R.vm.kill_object(model_decoder_layers_8_fc1_weight2) R.vm.kill_object(model_decoder_layers_8_fc1_bias2) model_decoder_layers_8_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] model_decoder_layers_8_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[702] gv257: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc158: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv257, R.dtype("float16")) _156: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_8_fc2_weight2, alloc157, model_decoder_layers_8_fc2_bias2, alloc158) R.vm.kill_object(alloc157) R.vm.kill_object(model_decoder_layers_8_fc2_weight2) R.vm.kill_object(model_decoder_layers_8_fc2_bias2) gv258: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc159: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv258, R.dtype("float16")) cls.add5(alloc155, alloc158, alloc159) R.vm.kill_object(alloc155) R.vm.kill_object(alloc158) model_decoder_layers_9_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[712] model_decoder_layers_9_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[713] gv259: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc160: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv259, R.dtype("float16")) cls.layer_norm2(alloc159, model_decoder_layers_9_self_attn_layer_norm_weight2, model_decoder_layers_9_self_attn_layer_norm_bias2, alloc160) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias2) model_decoder_layers_9_self_attn_q_proj_weight2: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[708] model_decoder_layers_9_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[709] gv260: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc161: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv260, R.dtype("float16")) _159: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_q_proj_weight2, alloc160, model_decoder_layers_9_self_attn_q_proj_bias2, alloc161) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias2) gv261: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape477: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc161, gv261, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc161) model_decoder_layers_9_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] gv262: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc162: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv262, R.dtype("float16")) _160: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_9_self_attn_k_proj_weight2, alloc160, alloc162) R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight2) gv263: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape478: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc162, gv263, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc162) model_decoder_layers_9_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] model_decoder_layers_9_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[707] gv264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc163: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv264, R.dtype("float16")) _161: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_v_proj_weight2, alloc160, model_decoder_layers_9_self_attn_v_proj_bias2, alloc163) R.vm.kill_object(alloc160) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias2) gv265: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape479: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc163, gv265, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc163) gv266: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc164: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv266, R.dtype("float16")) cls.concatenate1(reshape477, reshape478, reshape479, alloc164) R.vm.kill_object(reshape477) R.vm.kill_object(reshape478) R.vm.kill_object(reshape479) gv267: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape480: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc164, gv267, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc164) gv268: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc165: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv268, R.dtype("float16")) _163: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape480, alloc165) R.vm.kill_object(reshape480) gv269: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape481: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc165, gv269, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc165) gv270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape482: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape481, gv270, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape481) model_decoder_layers_9_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] model_decoder_layers_9_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[711] gv271: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc166: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv271, R.dtype("float16")) _164: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_out_proj_weight2, reshape482, model_decoder_layers_9_self_attn_out_proj_bias2, alloc166) R.vm.kill_object(reshape482) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias2) gv272: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc167: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv272, R.dtype("float16")) cls.add5(alloc159, alloc166, alloc167) R.vm.kill_object(alloc159) R.vm.kill_object(alloc166) model_decoder_layers_9_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[721] model_decoder_layers_9_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[722] gv273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc168: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv273, R.dtype("float16")) cls.layer_norm2(alloc167, model_decoder_layers_9_encoder_attn_layer_norm_weight2, model_decoder_layers_9_encoder_attn_layer_norm_bias2, alloc168) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias2) model_decoder_layers_9_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[717] model_decoder_layers_9_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[718] gv274: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc169: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv274, R.dtype("float16")) _167: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_q_proj_weight2, alloc168, model_decoder_layers_9_encoder_attn_q_proj_bias2, alloc169) R.vm.kill_object(alloc168) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias2) gv275: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape483: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc169, gv275, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc169) gv276: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape484: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape483, gv276, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape483) gv277: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc170: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv277, R.dtype("float16")) _168: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape484, alloc170) R.vm.kill_object(reshape484) gv278: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape485: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc170, gv278, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc170) gv279: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape486: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape485, gv279, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape485) model_decoder_layers_9_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] model_decoder_layers_9_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = 
packed_params[720] gv280: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc171: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv280, R.dtype("float16")) _169: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_out_proj_weight2, reshape486, model_decoder_layers_9_encoder_attn_out_proj_bias2, alloc171) R.vm.kill_object(reshape486) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias2) gv281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc172: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv281, R.dtype("float16")) cls.add5(alloc167, alloc171, alloc172) R.vm.kill_object(alloc167) R.vm.kill_object(alloc171) model_decoder_layers_9_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[727] model_decoder_layers_9_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[728] gv282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc173: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv282, R.dtype("float16")) cls.layer_norm2(alloc172, model_decoder_layers_9_final_layer_norm_weight2, model_decoder_layers_9_final_layer_norm_bias2, alloc173) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias2) model_decoder_layers_9_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] model_decoder_layers_9_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[724] gv283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc174: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv283, R.dtype("float16")) _172: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_9_fc1_weight2, alloc173, model_decoder_layers_9_fc1_bias2, alloc174) R.vm.kill_object(alloc173) R.vm.kill_object(model_decoder_layers_9_fc1_weight2) R.vm.kill_object(model_decoder_layers_9_fc1_bias2) model_decoder_layers_9_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] model_decoder_layers_9_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[726] gv284: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc175: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv284, R.dtype("float16")) _173: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_9_fc2_weight2, alloc174, model_decoder_layers_9_fc2_bias2, alloc175) R.vm.kill_object(alloc174) R.vm.kill_object(model_decoder_layers_9_fc2_weight2) R.vm.kill_object(model_decoder_layers_9_fc2_bias2) gv285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc176: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv285, R.dtype("float16")) cls.add5(alloc172, alloc175, alloc176) R.vm.kill_object(alloc172) R.vm.kill_object(alloc175) model_decoder_layers_10_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[736] model_decoder_layers_10_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[737] gv286: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc177: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv286, R.dtype("float16")) cls.layer_norm2(alloc176, model_decoder_layers_10_self_attn_layer_norm_weight2, model_decoder_layers_10_self_attn_layer_norm_bias2, alloc177) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias2) model_decoder_layers_10_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] model_decoder_layers_10_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[733] gv287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc178: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv287, R.dtype("float16")) _176: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_q_proj_weight2, alloc177, model_decoder_layers_10_self_attn_q_proj_bias2, alloc178) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias2) gv288: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape487: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc178, gv288, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc178) model_decoder_layers_10_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[729] gv289: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc179: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv289, R.dtype("float16")) _177: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_10_self_attn_k_proj_weight2, alloc177, alloc179) R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight2) gv290: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape488: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc179, gv290, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc179) model_decoder_layers_10_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] model_decoder_layers_10_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[731] gv291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc180: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv291, R.dtype("float16")) _178: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_v_proj_weight2, alloc177, model_decoder_layers_10_self_attn_v_proj_bias2, alloc180) R.vm.kill_object(alloc177) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias2) gv292: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape489: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc180, gv292, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc180) gv293: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc181: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv293, R.dtype("float16")) cls.concatenate1(reshape487, reshape488, reshape489, alloc181) R.vm.kill_object(reshape487) R.vm.kill_object(reshape488) R.vm.kill_object(reshape489) gv294: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape490: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc181, gv294, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc181) gv295: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc182: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv295, R.dtype("float16")) _180: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape490, alloc182) R.vm.kill_object(reshape490) gv296: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape491: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc182, gv296, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc182) gv297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape492: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape491, gv297, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape491) model_decoder_layers_10_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] model_decoder_layers_10_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[735] gv298: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc183: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv298, R.dtype("float16")) _181: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_out_proj_weight2, reshape492, model_decoder_layers_10_self_attn_out_proj_bias2, alloc183) R.vm.kill_object(reshape492) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias2) gv299: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc184: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv299, R.dtype("float16")) cls.add5(alloc176, alloc183, alloc184) R.vm.kill_object(alloc176) R.vm.kill_object(alloc183) model_decoder_layers_10_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[745] model_decoder_layers_10_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[746] gv300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc185: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv300, R.dtype("float16")) cls.layer_norm2(alloc184, model_decoder_layers_10_encoder_attn_layer_norm_weight2, model_decoder_layers_10_encoder_attn_layer_norm_bias2, alloc185) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias2) model_decoder_layers_10_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] model_decoder_layers_10_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[742] gv301: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc186: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv301, R.dtype("float16")) _184: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_q_proj_weight2, alloc185, model_decoder_layers_10_encoder_attn_q_proj_bias2, alloc186) R.vm.kill_object(alloc185) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias2) gv302: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape493: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc186, gv302, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc186) gv303: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape494: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape493, gv303, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape493) gv304: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc187: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv304, R.dtype("float16")) _185: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), 
R.prim_value(T.float32(1)), reshape494, alloc187) R.vm.kill_object(reshape494) gv305: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape495: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc187, gv305, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc187) gv306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape496: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape495, gv306, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape495) model_decoder_layers_10_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] model_decoder_layers_10_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[744] gv307: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc188: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv307, R.dtype("float16")) _186: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_out_proj_weight2, reshape496, model_decoder_layers_10_encoder_attn_out_proj_bias2, alloc188) R.vm.kill_object(reshape496) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias2) gv308: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc189: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv308, R.dtype("float16")) cls.add5(alloc184, alloc188, alloc189) R.vm.kill_object(alloc184) R.vm.kill_object(alloc188) model_decoder_layers_10_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[751] model_decoder_layers_10_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[752] gv309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc190: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv309, R.dtype("float16")) cls.layer_norm2(alloc189, model_decoder_layers_10_final_layer_norm_weight2, model_decoder_layers_10_final_layer_norm_bias2, alloc190) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias2) model_decoder_layers_10_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] model_decoder_layers_10_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[748] gv310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc191: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv310, R.dtype("float16")) _189: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_10_fc1_weight2, alloc190, model_decoder_layers_10_fc1_bias2, alloc191) R.vm.kill_object(alloc190) 
R.vm.kill_object(model_decoder_layers_10_fc1_weight2) R.vm.kill_object(model_decoder_layers_10_fc1_bias2) model_decoder_layers_10_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] model_decoder_layers_10_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[750] gv311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc192: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv311, R.dtype("float16")) _190: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_10_fc2_weight2, alloc191, model_decoder_layers_10_fc2_bias2, alloc192) R.vm.kill_object(alloc191) R.vm.kill_object(model_decoder_layers_10_fc2_weight2) R.vm.kill_object(model_decoder_layers_10_fc2_bias2) gv312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc193: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv312, R.dtype("float16")) cls.add5(alloc189, alloc192, alloc193) R.vm.kill_object(alloc189) R.vm.kill_object(alloc192) model_decoder_layers_11_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[760] model_decoder_layers_11_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[761] gv313: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc194: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv313, R.dtype("float16")) cls.layer_norm2(alloc193, 
model_decoder_layers_11_self_attn_layer_norm_weight2, model_decoder_layers_11_self_attn_layer_norm_bias2, alloc194) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias2) model_decoder_layers_11_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] model_decoder_layers_11_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[757] gv314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc195: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv314, R.dtype("float16")) _193: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_q_proj_weight2, alloc194, model_decoder_layers_11_self_attn_q_proj_bias2, alloc195) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias2) gv315: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape497: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc195, gv315, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc195) model_decoder_layers_11_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] gv316: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc196: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv316, R.dtype("float16")) _194: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_11_self_attn_k_proj_weight2, alloc194, alloc196) R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight2) gv317: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape498: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc196, gv317, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc196) model_decoder_layers_11_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] model_decoder_layers_11_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[755] gv318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc197: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv318, R.dtype("float16")) _195: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_v_proj_weight2, alloc194, model_decoder_layers_11_self_attn_v_proj_bias2, alloc197) R.vm.kill_object(alloc194) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias2) gv319: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape499: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc197, gv319, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc197) gv320: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc198: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv320, R.dtype("float16")) cls.concatenate1(reshape497, reshape498, reshape499, alloc198) R.vm.kill_object(reshape497) R.vm.kill_object(reshape498) R.vm.kill_object(reshape499) gv321: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape500: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc198, gv321, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc198) gv322: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc199: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv322, R.dtype("float16")) _197: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape500, alloc199) R.vm.kill_object(reshape500) gv323: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape501: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc199, gv323, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc199) gv324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape502: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape501, gv324, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape501) model_decoder_layers_11_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] model_decoder_layers_11_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[759] gv325: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc200: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv325, R.dtype("float16")) _198: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_out_proj_weight2, reshape502, model_decoder_layers_11_self_attn_out_proj_bias2, alloc200) R.vm.kill_object(reshape502) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias2) gv326: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc201: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv326, R.dtype("float16")) cls.add5(alloc193, alloc200, alloc201) R.vm.kill_object(alloc193) R.vm.kill_object(alloc200) 
model_decoder_layers_11_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[769] model_decoder_layers_11_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[770] gv327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc202: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv327, R.dtype("float16")) cls.layer_norm2(alloc201, model_decoder_layers_11_encoder_attn_layer_norm_weight2, model_decoder_layers_11_encoder_attn_layer_norm_bias2, alloc202) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias2) model_decoder_layers_11_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] model_decoder_layers_11_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[766] gv328: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc203: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv328, R.dtype("float16")) _201: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_q_proj_weight2, alloc202, model_decoder_layers_11_encoder_attn_q_proj_bias2, alloc203) R.vm.kill_object(alloc202) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias2) gv329: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape503: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc203, gv329, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc203) gv330: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape504: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape503, gv330, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape503) gv331: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc204: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv331, R.dtype("float16")) _202: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape504, alloc204) R.vm.kill_object(reshape504) gv332: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape505: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc204, gv332, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc204) gv333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape506: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape505, gv333, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape505) model_decoder_layers_11_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] model_decoder_layers_11_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[768] gv334: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc205: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv334, R.dtype("float16")) _203: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_out_proj_weight2, reshape506, model_decoder_layers_11_encoder_attn_out_proj_bias2, alloc205) R.vm.kill_object(reshape506) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias2) gv335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc206: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv335, R.dtype("float16")) cls.add5(alloc201, alloc205, alloc206) R.vm.kill_object(alloc201) R.vm.kill_object(alloc205) model_decoder_layers_11_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[775] model_decoder_layers_11_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[776] gv336: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc207: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv336, R.dtype("float16")) cls.layer_norm2(alloc206, model_decoder_layers_11_final_layer_norm_weight2, model_decoder_layers_11_final_layer_norm_bias2, alloc207) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias2) model_decoder_layers_11_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] model_decoder_layers_11_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[772] gv337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc208: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv337, R.dtype("float16")) _206: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_11_fc1_weight2, alloc207, model_decoder_layers_11_fc1_bias2, alloc208) R.vm.kill_object(alloc207) R.vm.kill_object(model_decoder_layers_11_fc1_weight2) R.vm.kill_object(model_decoder_layers_11_fc1_bias2) model_decoder_layers_11_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] model_decoder_layers_11_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[774] gv338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc209: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv338, R.dtype("float16")) _207: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_11_fc2_weight2, alloc208, model_decoder_layers_11_fc2_bias2, alloc209) 
R.vm.kill_object(alloc208) R.vm.kill_object(model_decoder_layers_11_fc2_weight2) R.vm.kill_object(model_decoder_layers_11_fc2_bias2) gv339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc210: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv339, R.dtype("float16")) cls.add5(alloc206, alloc209, alloc210) R.vm.kill_object(alloc206) R.vm.kill_object(alloc209) model_decoder_layers_12_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[784] model_decoder_layers_12_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[785] gv340: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc211: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv340, R.dtype("float16")) cls.layer_norm2(alloc210, model_decoder_layers_12_self_attn_layer_norm_weight2, model_decoder_layers_12_self_attn_layer_norm_bias2, alloc211) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias2) model_decoder_layers_12_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] model_decoder_layers_12_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[781] gv341: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc212: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv341, R.dtype("float16")) _210: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_q_proj_weight2, alloc211, model_decoder_layers_12_self_attn_q_proj_bias2, alloc212) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias2) gv342: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape507: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc212, gv342, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc212) model_decoder_layers_12_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] gv343: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc213: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv343, R.dtype("float16")) _211: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_12_self_attn_k_proj_weight2, alloc211, alloc213) R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight2) gv344: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape508: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc213, gv344, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc213) model_decoder_layers_12_self_attn_v_proj_weight2: R.Tensor((1280, 
1280), dtype="float16") = packed_params[778] model_decoder_layers_12_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[779] gv345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc214: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv345, R.dtype("float16")) _212: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_v_proj_weight2, alloc211, model_decoder_layers_12_self_attn_v_proj_bias2, alloc214) R.vm.kill_object(alloc211) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias2) gv346: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape509: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc214, gv346, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc214) gv347: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc215: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv347, R.dtype("float16")) cls.concatenate1(reshape507, reshape508, reshape509, alloc215) R.vm.kill_object(reshape507) R.vm.kill_object(reshape508) R.vm.kill_object(reshape509) gv348: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape510: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc215, gv348, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc215) gv349: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc216: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv349, R.dtype("float16")) _214: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape510, alloc216) R.vm.kill_object(reshape510) gv350: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape511: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc216, gv350, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc216) gv351: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape512: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape511, gv351, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape511) model_decoder_layers_12_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] model_decoder_layers_12_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[783] gv352: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc217: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv352, R.dtype("float16")) _215: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_out_proj_weight2, reshape512, model_decoder_layers_12_self_attn_out_proj_bias2, alloc217) R.vm.kill_object(reshape512) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias2) gv353: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc218: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv353, R.dtype("float16")) cls.add5(alloc210, alloc217, alloc218) R.vm.kill_object(alloc210) R.vm.kill_object(alloc217) model_decoder_layers_12_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[793] model_decoder_layers_12_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[794] gv354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc219: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv354, R.dtype("float16")) cls.layer_norm2(alloc218, model_decoder_layers_12_encoder_attn_layer_norm_weight2, model_decoder_layers_12_encoder_attn_layer_norm_bias2, alloc219) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias2) model_decoder_layers_12_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] model_decoder_layers_12_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[790] gv355: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc220: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv355, R.dtype("float16")) _218: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_q_proj_weight2, alloc219, model_decoder_layers_12_encoder_attn_q_proj_bias2, alloc220) R.vm.kill_object(alloc219) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias2) gv356: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape513: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc220, gv356, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc220) gv357: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape514: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape513, gv357, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape513) gv358: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc221: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv358, R.dtype("float16")) _219: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape514, alloc221) R.vm.kill_object(reshape514) gv359: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape515: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc221, gv359, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc221) gv360: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape516: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape515, gv360, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape515) model_decoder_layers_12_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] model_decoder_layers_12_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[792] gv361: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc222: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv361, R.dtype("float16")) _220: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_out_proj_weight2, reshape516, model_decoder_layers_12_encoder_attn_out_proj_bias2, alloc222) R.vm.kill_object(reshape516) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias2) gv362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc223: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv362, R.dtype("float16")) cls.add5(alloc218, alloc222, alloc223) R.vm.kill_object(alloc218) R.vm.kill_object(alloc222) model_decoder_layers_12_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[799] model_decoder_layers_12_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[800] gv363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc224: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv363, R.dtype("float16")) cls.layer_norm2(alloc223, model_decoder_layers_12_final_layer_norm_weight2, model_decoder_layers_12_final_layer_norm_bias2, alloc224) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias2) model_decoder_layers_12_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] model_decoder_layers_12_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[796] gv364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc225: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv364, R.dtype("float16")) _223: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_12_fc1_weight2, alloc224, model_decoder_layers_12_fc1_bias2, alloc225) R.vm.kill_object(alloc224) R.vm.kill_object(model_decoder_layers_12_fc1_weight2) R.vm.kill_object(model_decoder_layers_12_fc1_bias2) model_decoder_layers_12_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] model_decoder_layers_12_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[798] gv365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc226: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv365, R.dtype("float16")) _224: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_12_fc2_weight2, alloc225, model_decoder_layers_12_fc2_bias2, alloc226) R.vm.kill_object(alloc225) R.vm.kill_object(model_decoder_layers_12_fc2_weight2) R.vm.kill_object(model_decoder_layers_12_fc2_bias2) gv366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc227: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv366, R.dtype("float16")) cls.add5(alloc223, alloc226, alloc227) R.vm.kill_object(alloc223) R.vm.kill_object(alloc226) model_decoder_layers_13_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[808] model_decoder_layers_13_self_attn_layer_norm_bias2: R.Tensor((1280,), 
dtype="float16") = packed_params[809] gv367: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc228: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv367, R.dtype("float16")) cls.layer_norm2(alloc227, model_decoder_layers_13_self_attn_layer_norm_weight2, model_decoder_layers_13_self_attn_layer_norm_bias2, alloc228) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias2) model_decoder_layers_13_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] model_decoder_layers_13_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[805] gv368: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc229: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv368, R.dtype("float16")) _227: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_q_proj_weight2, alloc228, model_decoder_layers_13_self_attn_q_proj_bias2, alloc229) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias2) gv369: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape517: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc229, gv369, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc229) model_decoder_layers_13_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] gv370: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc230: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv370, R.dtype("float16")) _228: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_13_self_attn_k_proj_weight2, alloc228, alloc230) R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight2) gv371: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape518: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc230, gv371, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc230) model_decoder_layers_13_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] model_decoder_layers_13_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[803] gv372: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc231: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv372, R.dtype("float16")) _229: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_v_proj_weight2, alloc228, model_decoder_layers_13_self_attn_v_proj_bias2, alloc231) R.vm.kill_object(alloc228) 
R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias2) gv373: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape519: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc231, gv373, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc231) gv374: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc232: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv374, R.dtype("float16")) cls.concatenate1(reshape517, reshape518, reshape519, alloc232) R.vm.kill_object(reshape517) R.vm.kill_object(reshape518) R.vm.kill_object(reshape519) gv375: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape520: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc232, gv375, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc232) gv376: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc233: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv376, R.dtype("float16")) _231: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", 
paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape520, alloc233) R.vm.kill_object(reshape520) gv377: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape521: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc233, gv377, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc233) gv378: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape522: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape521, gv378, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape521) model_decoder_layers_13_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] model_decoder_layers_13_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[807] gv379: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc234: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv379, R.dtype("float16")) _232: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_out_proj_weight2, reshape522, model_decoder_layers_13_self_attn_out_proj_bias2, alloc234) R.vm.kill_object(reshape522) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias2) gv380: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc235: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv380, R.dtype("float16")) cls.add5(alloc227, alloc234, alloc235) R.vm.kill_object(alloc227) R.vm.kill_object(alloc234) model_decoder_layers_13_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[817] model_decoder_layers_13_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[818] gv381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc236: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv381, R.dtype("float16")) cls.layer_norm2(alloc235, model_decoder_layers_13_encoder_attn_layer_norm_weight2, model_decoder_layers_13_encoder_attn_layer_norm_bias2, alloc236) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias2) model_decoder_layers_13_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] model_decoder_layers_13_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[814] gv382: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc237: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv382, R.dtype("float16")) _235: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_q_proj_weight2, 
alloc236, model_decoder_layers_13_encoder_attn_q_proj_bias2, alloc237) R.vm.kill_object(alloc236) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias2) gv383: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape523: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc237, gv383, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc237) gv384: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape524: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape523, gv384, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape523) gv385: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc238: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv385, R.dtype("float16")) _236: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape524, alloc238) R.vm.kill_object(reshape524) gv386: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape525: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc238, gv386, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc238) gv387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape526: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape525, gv387, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape525) model_decoder_layers_13_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] model_decoder_layers_13_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[816] gv388: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc239: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv388, R.dtype("float16")) _237: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_out_proj_weight2, reshape526, model_decoder_layers_13_encoder_attn_out_proj_bias2, alloc239) R.vm.kill_object(reshape526) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias2) gv389: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc240: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv389, R.dtype("float16")) cls.add5(alloc235, alloc239, alloc240) R.vm.kill_object(alloc235) R.vm.kill_object(alloc239) 
model_decoder_layers_13_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[823] model_decoder_layers_13_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[824] gv390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc241: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv390, R.dtype("float16")) cls.layer_norm2(alloc240, model_decoder_layers_13_final_layer_norm_weight2, model_decoder_layers_13_final_layer_norm_bias2, alloc241) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias2) model_decoder_layers_13_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] model_decoder_layers_13_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[820] gv391: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc242: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv391, R.dtype("float16")) _240: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_13_fc1_weight2, alloc241, model_decoder_layers_13_fc1_bias2, alloc242) R.vm.kill_object(alloc241) R.vm.kill_object(model_decoder_layers_13_fc1_weight2) R.vm.kill_object(model_decoder_layers_13_fc1_bias2) model_decoder_layers_13_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] model_decoder_layers_13_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[822] gv392: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc243: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv392, R.dtype("float16")) _241: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_13_fc2_weight2, alloc242, model_decoder_layers_13_fc2_bias2, alloc243) R.vm.kill_object(alloc242) R.vm.kill_object(model_decoder_layers_13_fc2_weight2) R.vm.kill_object(model_decoder_layers_13_fc2_bias2) gv393: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc244: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv393, R.dtype("float16")) cls.add5(alloc240, alloc243, alloc244) R.vm.kill_object(alloc240) R.vm.kill_object(alloc243) model_decoder_layers_14_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[832] model_decoder_layers_14_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[833] gv394: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc245: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv394, R.dtype("float16")) cls.layer_norm2(alloc244, model_decoder_layers_14_self_attn_layer_norm_weight2, model_decoder_layers_14_self_attn_layer_norm_bias2, alloc245) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias2) model_decoder_layers_14_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] model_decoder_layers_14_self_attn_q_proj_bias2: 
R.Tensor((1280,), dtype="float16") = packed_params[829] gv395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc246: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv395, R.dtype("float16")) _244: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_q_proj_weight2, alloc245, model_decoder_layers_14_self_attn_q_proj_bias2, alloc246) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias2) gv396: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape527: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc246, gv396, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc246) model_decoder_layers_14_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] gv397: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc247: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv397, R.dtype("float16")) _245: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_14_self_attn_k_proj_weight2, alloc245, alloc247) R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight2) gv398: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape528: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc247, gv398, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc247) model_decoder_layers_14_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] model_decoder_layers_14_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[827] gv399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc248: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv399, R.dtype("float16")) _246: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_v_proj_weight2, alloc245, model_decoder_layers_14_self_attn_v_proj_bias2, alloc248) R.vm.kill_object(alloc245) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias2) gv400: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape529: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc248, gv400, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc248) gv401: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) alloc249: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv401, R.dtype("float16")) cls.concatenate1(reshape527, reshape528, reshape529, alloc249) R.vm.kill_object(reshape527) R.vm.kill_object(reshape528) R.vm.kill_object(reshape529) gv402: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape530: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc249, gv402, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc249) gv403: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc250: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv403, R.dtype("float16")) _248: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape530, alloc250) R.vm.kill_object(reshape530) gv404: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape531: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc250, gv404, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc250) gv405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape532: R.Tensor((1, 
seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape531, gv405, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape531) model_decoder_layers_14_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] model_decoder_layers_14_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[831] gv406: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc251: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv406, R.dtype("float16")) _249: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_out_proj_weight2, reshape532, model_decoder_layers_14_self_attn_out_proj_bias2, alloc251) R.vm.kill_object(reshape532) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias2) gv407: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc252: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv407, R.dtype("float16")) cls.add5(alloc244, alloc251, alloc252) R.vm.kill_object(alloc244) R.vm.kill_object(alloc251) model_decoder_layers_14_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[841] model_decoder_layers_14_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[842] gv408: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc253: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv408, R.dtype("float16")) cls.layer_norm2(alloc252, model_decoder_layers_14_encoder_attn_layer_norm_weight2, model_decoder_layers_14_encoder_attn_layer_norm_bias2, alloc253) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias2) model_decoder_layers_14_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] model_decoder_layers_14_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[838] gv409: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc254: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv409, R.dtype("float16")) _252: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_q_proj_weight2, alloc253, model_decoder_layers_14_encoder_attn_q_proj_bias2, alloc254) R.vm.kill_object(alloc253) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias2) gv410: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape533: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc254, gv410, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc254) gv411: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape534: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape533, gv411, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape533) gv412: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc255: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv412, R.dtype("float16")) _253: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape534, alloc255) R.vm.kill_object(reshape534) gv413: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape535: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc255, gv413, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc255) gv414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape536: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape535, gv414, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape535) model_decoder_layers_14_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] model_decoder_layers_14_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[840] gv415: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc256: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv415, R.dtype("float16")) _254: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_out_proj_weight2, reshape536, model_decoder_layers_14_encoder_attn_out_proj_bias2, alloc256) R.vm.kill_object(reshape536) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias2) gv416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc257: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv416, R.dtype("float16")) cls.add5(alloc252, alloc256, alloc257) R.vm.kill_object(alloc252) R.vm.kill_object(alloc256) model_decoder_layers_14_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[847] model_decoder_layers_14_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[848] gv417: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc258: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv417, R.dtype("float16")) cls.layer_norm2(alloc257, model_decoder_layers_14_final_layer_norm_weight2, model_decoder_layers_14_final_layer_norm_bias2, alloc258) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias2) 
model_decoder_layers_14_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[843] model_decoder_layers_14_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[844] gv418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc259: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv418, R.dtype("float16")) _257: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_14_fc1_weight2, alloc258, model_decoder_layers_14_fc1_bias2, alloc259) R.vm.kill_object(alloc258) R.vm.kill_object(model_decoder_layers_14_fc1_weight2) R.vm.kill_object(model_decoder_layers_14_fc1_bias2) model_decoder_layers_14_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] model_decoder_layers_14_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[846] gv419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc260: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv419, R.dtype("float16")) _258: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_14_fc2_weight2, alloc259, model_decoder_layers_14_fc2_bias2, alloc260) R.vm.kill_object(alloc259) R.vm.kill_object(model_decoder_layers_14_fc2_weight2) R.vm.kill_object(model_decoder_layers_14_fc2_bias2) gv420: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc261: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage6, R.prim_value(0), gv420, R.dtype("float16")) cls.add5(alloc257, alloc260, alloc261) R.vm.kill_object(alloc257) R.vm.kill_object(alloc260) model_decoder_layers_15_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[856] model_decoder_layers_15_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[857] gv421: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc262: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv421, R.dtype("float16")) cls.layer_norm2(alloc261, model_decoder_layers_15_self_attn_layer_norm_weight2, model_decoder_layers_15_self_attn_layer_norm_bias2, alloc262) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias2) model_decoder_layers_15_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] model_decoder_layers_15_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[853] gv422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc263: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv422, R.dtype("float16")) _261: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_q_proj_weight2, alloc262, model_decoder_layers_15_self_attn_q_proj_bias2, alloc263) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias2) gv423: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape537: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc263, gv423, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc263) model_decoder_layers_15_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] gv424: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc264: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv424, R.dtype("float16")) _262: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_15_self_attn_k_proj_weight2, alloc262, alloc264) R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight2) gv425: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape538: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc264, gv425, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc264) model_decoder_layers_15_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] model_decoder_layers_15_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[851] gv426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc265: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv426, R.dtype("float16")) _263: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_v_proj_weight2, alloc262, model_decoder_layers_15_self_attn_v_proj_bias2, alloc265) R.vm.kill_object(alloc262) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias2) gv427: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape539: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc265, gv427, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc265) gv428: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc266: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv428, R.dtype("float16")) cls.concatenate1(reshape537, reshape538, reshape539, alloc266) R.vm.kill_object(reshape537) R.vm.kill_object(reshape538) R.vm.kill_object(reshape539) gv429: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape540: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc266, gv429, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc266) gv430: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc267: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv430, R.dtype("float16")) _265: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape540, alloc267) R.vm.kill_object(reshape540) gv431: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape541: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc267, gv431, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc267) gv432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape542: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape541, gv432, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape541) model_decoder_layers_15_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] model_decoder_layers_15_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[855] gv433: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc268: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv433, R.dtype("float16")) _266: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_out_proj_weight2, reshape542, model_decoder_layers_15_self_attn_out_proj_bias2, alloc268) R.vm.kill_object(reshape542) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias2) gv434: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc269: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv434, R.dtype("float16")) cls.add5(alloc261, alloc268, alloc269) R.vm.kill_object(alloc261) R.vm.kill_object(alloc268) model_decoder_layers_15_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[865] model_decoder_layers_15_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[866] gv435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc270: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv435, R.dtype("float16")) cls.layer_norm2(alloc269, model_decoder_layers_15_encoder_attn_layer_norm_weight2, model_decoder_layers_15_encoder_attn_layer_norm_bias2, alloc270) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias2) model_decoder_layers_15_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] model_decoder_layers_15_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[862] gv436: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc271: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv436, R.dtype("float16")) _269: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_q_proj_weight2, alloc270, model_decoder_layers_15_encoder_attn_q_proj_bias2, alloc271) R.vm.kill_object(alloc270) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias2) gv437: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape543: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc271, gv437, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc271) gv438: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape544: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape543, gv438, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape543) gv439: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc272: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv439, R.dtype("float16")) _270: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(15), 
R.prim_value(T.float32(1)), reshape544, alloc272) R.vm.kill_object(reshape544) gv440: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape545: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc272, gv440, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc272) gv441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape546: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape545, gv441, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape545) model_decoder_layers_15_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] model_decoder_layers_15_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[864] gv442: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc273: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv442, R.dtype("float16")) _271: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_out_proj_weight2, reshape546, model_decoder_layers_15_encoder_attn_out_proj_bias2, alloc273) R.vm.kill_object(reshape546) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias2) gv443: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc274: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv443, R.dtype("float16")) cls.add5(alloc269, alloc273, alloc274) R.vm.kill_object(alloc269) R.vm.kill_object(alloc273) model_decoder_layers_15_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[871] model_decoder_layers_15_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[872] gv444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc275: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv444, R.dtype("float16")) cls.layer_norm2(alloc274, model_decoder_layers_15_final_layer_norm_weight2, model_decoder_layers_15_final_layer_norm_bias2, alloc275) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias2) model_decoder_layers_15_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] model_decoder_layers_15_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[868] gv445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc276: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv445, R.dtype("float16")) _274: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_15_fc1_weight2, alloc275, model_decoder_layers_15_fc1_bias2, alloc276) R.vm.kill_object(alloc275) 
R.vm.kill_object(model_decoder_layers_15_fc1_weight2) R.vm.kill_object(model_decoder_layers_15_fc1_bias2) model_decoder_layers_15_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] model_decoder_layers_15_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[870] gv446: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc277: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv446, R.dtype("float16")) _275: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_15_fc2_weight2, alloc276, model_decoder_layers_15_fc2_bias2, alloc277) R.vm.kill_object(alloc276) R.vm.kill_object(model_decoder_layers_15_fc2_weight2) R.vm.kill_object(model_decoder_layers_15_fc2_bias2) gv447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc278: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv447, R.dtype("float16")) cls.add5(alloc274, alloc277, alloc278) R.vm.kill_object(alloc274) R.vm.kill_object(alloc277) model_decoder_layers_16_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[880] model_decoder_layers_16_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[881] gv448: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc279: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv448, R.dtype("float16")) cls.layer_norm2(alloc278, 
model_decoder_layers_16_self_attn_layer_norm_weight2, model_decoder_layers_16_self_attn_layer_norm_bias2, alloc279) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias2) model_decoder_layers_16_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] model_decoder_layers_16_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[877] gv449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc280: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv449, R.dtype("float16")) _278: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_q_proj_weight2, alloc279, model_decoder_layers_16_self_attn_q_proj_bias2, alloc280) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias2) gv450: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape547: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc280, gv450, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc280) model_decoder_layers_16_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] gv451: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc281: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv451, R.dtype("float16")) _279: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_16_self_attn_k_proj_weight2, alloc279, alloc281) R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight2) gv452: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape548: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc281, gv452, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc281) model_decoder_layers_16_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] model_decoder_layers_16_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[875] gv453: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc282: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv453, R.dtype("float16")) _280: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_v_proj_weight2, alloc279, model_decoder_layers_16_self_attn_v_proj_bias2, alloc282) R.vm.kill_object(alloc279) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias2) gv454: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape549: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc282, gv454, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc282) gv455: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc283: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv455, R.dtype("float16")) cls.concatenate1(reshape547, reshape548, reshape549, alloc283) R.vm.kill_object(reshape547) R.vm.kill_object(reshape548) R.vm.kill_object(reshape549) gv456: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape550: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc283, gv456, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc283) gv457: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc284: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv457, R.dtype("float16")) _282: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape550, alloc284) R.vm.kill_object(reshape550) gv458: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape551: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc284, gv458, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc284) gv459: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape552: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape551, gv459, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape551) model_decoder_layers_16_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] model_decoder_layers_16_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[879] gv460: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc285: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv460, R.dtype("float16")) _283: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_out_proj_weight2, reshape552, model_decoder_layers_16_self_attn_out_proj_bias2, alloc285) R.vm.kill_object(reshape552) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias2) gv461: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc286: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv461, R.dtype("float16")) cls.add5(alloc278, alloc285, alloc286) R.vm.kill_object(alloc278) R.vm.kill_object(alloc285) 
model_decoder_layers_16_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[889] model_decoder_layers_16_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[890] gv462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc287: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv462, R.dtype("float16")) cls.layer_norm2(alloc286, model_decoder_layers_16_encoder_attn_layer_norm_weight2, model_decoder_layers_16_encoder_attn_layer_norm_bias2, alloc287) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias2) model_decoder_layers_16_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] model_decoder_layers_16_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[886] gv463: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc288: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv463, R.dtype("float16")) _286: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_q_proj_weight2, alloc287, model_decoder_layers_16_encoder_attn_q_proj_bias2, alloc288) R.vm.kill_object(alloc287) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias2) gv464: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape553: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc288, gv464, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc288) gv465: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape554: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape553, gv465, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape553) gv466: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc289: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv466, R.dtype("float16")) _287: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape554, alloc289) R.vm.kill_object(reshape554) gv467: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape555: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc289, gv467, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc289) gv468: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape556: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape555, gv468, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape555) model_decoder_layers_16_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] model_decoder_layers_16_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[888] gv469: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc290: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv469, R.dtype("float16")) _288: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_out_proj_weight2, reshape556, model_decoder_layers_16_encoder_attn_out_proj_bias2, alloc290) R.vm.kill_object(reshape556) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias2) gv470: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc291: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv470, R.dtype("float16")) cls.add5(alloc286, alloc290, alloc291) R.vm.kill_object(alloc286) R.vm.kill_object(alloc290) model_decoder_layers_16_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[895] model_decoder_layers_16_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[896] gv471: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc292: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv471, R.dtype("float16")) cls.layer_norm2(alloc291, model_decoder_layers_16_final_layer_norm_weight2, model_decoder_layers_16_final_layer_norm_bias2, alloc292) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias2) model_decoder_layers_16_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] model_decoder_layers_16_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[892] gv472: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc293: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv472, R.dtype("float16")) _291: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_16_fc1_weight2, alloc292, model_decoder_layers_16_fc1_bias2, alloc293) R.vm.kill_object(alloc292) R.vm.kill_object(model_decoder_layers_16_fc1_weight2) R.vm.kill_object(model_decoder_layers_16_fc1_bias2) model_decoder_layers_16_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] model_decoder_layers_16_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[894] gv473: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc294: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv473, R.dtype("float16")) _292: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_16_fc2_weight2, alloc293, model_decoder_layers_16_fc2_bias2, alloc294) 
R.vm.kill_object(alloc293) R.vm.kill_object(model_decoder_layers_16_fc2_weight2) R.vm.kill_object(model_decoder_layers_16_fc2_bias2) gv474: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc295: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv474, R.dtype("float16")) cls.add5(alloc291, alloc294, alloc295) R.vm.kill_object(alloc291) R.vm.kill_object(alloc294) model_decoder_layers_17_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[904] model_decoder_layers_17_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[905] gv475: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc296: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv475, R.dtype("float16")) cls.layer_norm2(alloc295, model_decoder_layers_17_self_attn_layer_norm_weight2, model_decoder_layers_17_self_attn_layer_norm_bias2, alloc296) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias2) model_decoder_layers_17_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] model_decoder_layers_17_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[901] gv476: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc297: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv476, R.dtype("float16")) _295: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_q_proj_weight2, alloc296, model_decoder_layers_17_self_attn_q_proj_bias2, alloc297) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias2) gv477: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape557: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc297, gv477, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc297) model_decoder_layers_17_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] gv478: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc298: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv478, R.dtype("float16")) _296: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_17_self_attn_k_proj_weight2, alloc296, alloc298) R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight2) gv479: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape558: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc298, gv479, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc298) model_decoder_layers_17_self_attn_v_proj_weight2: R.Tensor((1280, 
1280), dtype="float16") = packed_params[898] model_decoder_layers_17_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[899] gv480: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc299: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv480, R.dtype("float16")) _297: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_v_proj_weight2, alloc296, model_decoder_layers_17_self_attn_v_proj_bias2, alloc299) R.vm.kill_object(alloc296) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias2) gv481: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape559: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc299, gv481, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc299) gv482: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc300: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv482, R.dtype("float16")) cls.concatenate1(reshape557, reshape558, reshape559, alloc300) R.vm.kill_object(reshape557) R.vm.kill_object(reshape558) R.vm.kill_object(reshape559) gv483: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape560: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc300, gv483, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc300) gv484: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc301: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv484, R.dtype("float16")) _299: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape560, alloc301) R.vm.kill_object(reshape560) gv485: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape561: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc301, gv485, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc301) gv486: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape562: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape561, gv486, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape561) model_decoder_layers_17_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] model_decoder_layers_17_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[903] gv487: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc302: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv487, R.dtype("float16")) _300: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_out_proj_weight2, reshape562, model_decoder_layers_17_self_attn_out_proj_bias2, alloc302) R.vm.kill_object(reshape562) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias2) gv488: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc303: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv488, R.dtype("float16")) cls.add5(alloc295, alloc302, alloc303) R.vm.kill_object(alloc295) R.vm.kill_object(alloc302) model_decoder_layers_17_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[913] model_decoder_layers_17_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[914] gv489: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc304: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv489, R.dtype("float16")) cls.layer_norm2(alloc303, model_decoder_layers_17_encoder_attn_layer_norm_weight2, model_decoder_layers_17_encoder_attn_layer_norm_bias2, alloc304) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias2) model_decoder_layers_17_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] model_decoder_layers_17_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[910] gv490: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc305: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv490, R.dtype("float16")) _303: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_q_proj_weight2, alloc304, model_decoder_layers_17_encoder_attn_q_proj_bias2, alloc305) R.vm.kill_object(alloc304) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias2) gv491: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape563: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc305, gv491, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc305) gv492: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape564: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape563, gv492, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape563) gv493: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc306: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv493, R.dtype("float16")) _304: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape564, alloc306) R.vm.kill_object(reshape564) gv494: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape565: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc306, gv494, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc306) gv495: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape566: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape565, gv495, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape565) model_decoder_layers_17_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] model_decoder_layers_17_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[912] gv496: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc307: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv496, R.dtype("float16")) _305: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_out_proj_weight2, reshape566, model_decoder_layers_17_encoder_attn_out_proj_bias2, alloc307) R.vm.kill_object(reshape566) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias2) gv497: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc308: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv497, R.dtype("float16")) cls.add5(alloc303, alloc307, alloc308) R.vm.kill_object(alloc303) R.vm.kill_object(alloc307) model_decoder_layers_17_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[919] model_decoder_layers_17_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[920] gv498: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc309: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv498, R.dtype("float16")) cls.layer_norm2(alloc308, model_decoder_layers_17_final_layer_norm_weight2, model_decoder_layers_17_final_layer_norm_bias2, alloc309) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias2) model_decoder_layers_17_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[915] model_decoder_layers_17_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[916] gv499: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc310: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv499, R.dtype("float16")) _308: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_17_fc1_weight2, alloc309, model_decoder_layers_17_fc1_bias2, alloc310) R.vm.kill_object(alloc309) R.vm.kill_object(model_decoder_layers_17_fc1_weight2) R.vm.kill_object(model_decoder_layers_17_fc1_bias2) model_decoder_layers_17_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] model_decoder_layers_17_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[918] gv500: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc311: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv500, R.dtype("float16")) _309: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_17_fc2_weight2, alloc310, model_decoder_layers_17_fc2_bias2, alloc311) R.vm.kill_object(alloc310) R.vm.kill_object(model_decoder_layers_17_fc2_weight2) R.vm.kill_object(model_decoder_layers_17_fc2_bias2) gv501: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc312: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv501, R.dtype("float16")) cls.add5(alloc308, alloc311, alloc312) R.vm.kill_object(alloc308) R.vm.kill_object(alloc311) model_decoder_layers_18_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[928] model_decoder_layers_18_self_attn_layer_norm_bias2: R.Tensor((1280,), 
dtype="float16") = packed_params[929] gv502: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc313: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv502, R.dtype("float16")) cls.layer_norm2(alloc312, model_decoder_layers_18_self_attn_layer_norm_weight2, model_decoder_layers_18_self_attn_layer_norm_bias2, alloc313) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias2) model_decoder_layers_18_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] model_decoder_layers_18_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[925] gv503: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc314: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv503, R.dtype("float16")) _312: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_q_proj_weight2, alloc313, model_decoder_layers_18_self_attn_q_proj_bias2, alloc314) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias2) gv504: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape567: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc314, gv504, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc314) model_decoder_layers_18_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] gv505: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc315: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv505, R.dtype("float16")) _313: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_18_self_attn_k_proj_weight2, alloc313, alloc315) R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight2) gv506: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape568: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc315, gv506, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc315) model_decoder_layers_18_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] model_decoder_layers_18_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[923] gv507: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc316: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv507, R.dtype("float16")) _314: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_v_proj_weight2, alloc313, model_decoder_layers_18_self_attn_v_proj_bias2, alloc316) R.vm.kill_object(alloc313) 
R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias2) gv508: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape569: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc316, gv508, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc316) gv509: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc317: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv509, R.dtype("float16")) cls.concatenate1(reshape567, reshape568, reshape569, alloc317) R.vm.kill_object(reshape567) R.vm.kill_object(reshape568) R.vm.kill_object(reshape569) gv510: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape570: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc317, gv510, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc317) gv511: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc318: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv511, R.dtype("float16")) _316: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", 
paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape570, alloc318) R.vm.kill_object(reshape570) gv512: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape571: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc318, gv512, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc318) gv513: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape572: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape571, gv513, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape571) model_decoder_layers_18_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] model_decoder_layers_18_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[927] gv514: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc319: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv514, R.dtype("float16")) _317: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_out_proj_weight2, reshape572, model_decoder_layers_18_self_attn_out_proj_bias2, alloc319) R.vm.kill_object(reshape572) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias2) gv515: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc320: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv515, R.dtype("float16")) cls.add5(alloc312, alloc319, alloc320) R.vm.kill_object(alloc312) R.vm.kill_object(alloc319) model_decoder_layers_18_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[937] model_decoder_layers_18_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[938] gv516: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc321: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv516, R.dtype("float16")) cls.layer_norm2(alloc320, model_decoder_layers_18_encoder_attn_layer_norm_weight2, model_decoder_layers_18_encoder_attn_layer_norm_bias2, alloc321) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias2) model_decoder_layers_18_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] model_decoder_layers_18_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[934] gv517: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc322: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv517, R.dtype("float16")) _320: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_q_proj_weight2, 
alloc321, model_decoder_layers_18_encoder_attn_q_proj_bias2, alloc322) R.vm.kill_object(alloc321) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias2) gv518: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape573: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc322, gv518, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc322) gv519: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape574: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape573, gv519, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape573) gv520: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc323: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv520, R.dtype("float16")) _321: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape574, alloc323) R.vm.kill_object(reshape574) gv521: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape575: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc323, gv521, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc323) gv522: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape576: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape575, gv522, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape575) model_decoder_layers_18_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] model_decoder_layers_18_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[936] gv523: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc324: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv523, R.dtype("float16")) _322: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_out_proj_weight2, reshape576, model_decoder_layers_18_encoder_attn_out_proj_bias2, alloc324) R.vm.kill_object(reshape576) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias2) gv524: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc325: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv524, R.dtype("float16")) cls.add5(alloc320, alloc324, alloc325) R.vm.kill_object(alloc320) R.vm.kill_object(alloc324) 
model_decoder_layers_18_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[943] model_decoder_layers_18_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[944] gv525: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc326: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv525, R.dtype("float16")) cls.layer_norm2(alloc325, model_decoder_layers_18_final_layer_norm_weight2, model_decoder_layers_18_final_layer_norm_bias2, alloc326) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias2) model_decoder_layers_18_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] model_decoder_layers_18_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[940] gv526: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc327: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv526, R.dtype("float16")) _325: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_18_fc1_weight2, alloc326, model_decoder_layers_18_fc1_bias2, alloc327) R.vm.kill_object(alloc326) R.vm.kill_object(model_decoder_layers_18_fc1_weight2) R.vm.kill_object(model_decoder_layers_18_fc1_bias2) model_decoder_layers_18_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] model_decoder_layers_18_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[942] gv527: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc328: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv527, R.dtype("float16")) _326: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_18_fc2_weight2, alloc327, model_decoder_layers_18_fc2_bias2, alloc328) R.vm.kill_object(alloc327) R.vm.kill_object(model_decoder_layers_18_fc2_weight2) R.vm.kill_object(model_decoder_layers_18_fc2_bias2) gv528: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc329: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv528, R.dtype("float16")) cls.add5(alloc325, alloc328, alloc329) R.vm.kill_object(alloc325) R.vm.kill_object(alloc328) model_decoder_layers_19_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[952] model_decoder_layers_19_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[953] gv529: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc330: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv529, R.dtype("float16")) cls.layer_norm2(alloc329, model_decoder_layers_19_self_attn_layer_norm_weight2, model_decoder_layers_19_self_attn_layer_norm_bias2, alloc330) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias2) model_decoder_layers_19_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] model_decoder_layers_19_self_attn_q_proj_bias2: 
R.Tensor((1280,), dtype="float16") = packed_params[949] gv530: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc331: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv530, R.dtype("float16")) _329: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_q_proj_weight2, alloc330, model_decoder_layers_19_self_attn_q_proj_bias2, alloc331) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias2) gv531: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape577: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc331, gv531, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc331) model_decoder_layers_19_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] gv532: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc332: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv532, R.dtype("float16")) _330: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_19_self_attn_k_proj_weight2, alloc330, alloc332) R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight2) gv533: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape578: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc332, gv533, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc332) model_decoder_layers_19_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] model_decoder_layers_19_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[947] gv534: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc333: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv534, R.dtype("float16")) _331: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_v_proj_weight2, alloc330, model_decoder_layers_19_self_attn_v_proj_bias2, alloc333) R.vm.kill_object(alloc330) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias2) gv535: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape579: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc333, gv535, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc333) gv536: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) alloc334: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv536, R.dtype("float16")) cls.concatenate1(reshape577, reshape578, reshape579, alloc334) R.vm.kill_object(reshape577) R.vm.kill_object(reshape578) R.vm.kill_object(reshape579) gv537: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape580: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc334, gv537, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc334) gv538: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc335: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv538, R.dtype("float16")) _333: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape580, alloc335) R.vm.kill_object(reshape580) gv539: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape581: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc335, gv539, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc335) gv540: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape582: R.Tensor((1, 
seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape581, gv540, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape581) model_decoder_layers_19_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] model_decoder_layers_19_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[951] gv541: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc336: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv541, R.dtype("float16")) _334: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_out_proj_weight2, reshape582, model_decoder_layers_19_self_attn_out_proj_bias2, alloc336) R.vm.kill_object(reshape582) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias2) gv542: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc337: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv542, R.dtype("float16")) cls.add5(alloc329, alloc336, alloc337) R.vm.kill_object(alloc329) R.vm.kill_object(alloc336) model_decoder_layers_19_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[961] model_decoder_layers_19_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[962] gv543: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc338: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv543, R.dtype("float16")) cls.layer_norm2(alloc337, model_decoder_layers_19_encoder_attn_layer_norm_weight2, model_decoder_layers_19_encoder_attn_layer_norm_bias2, alloc338) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias2) model_decoder_layers_19_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] model_decoder_layers_19_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[958] gv544: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc339: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv544, R.dtype("float16")) _337: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_q_proj_weight2, alloc338, model_decoder_layers_19_encoder_attn_q_proj_bias2, alloc339) R.vm.kill_object(alloc338) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias2) gv545: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape583: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc339, gv545, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc339) gv546: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape584: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape583, gv546, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape583) gv547: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc340: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv547, R.dtype("float16")) _338: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape584, alloc340) R.vm.kill_object(reshape584) gv548: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape585: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc340, gv548, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc340) gv549: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape586: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape585, gv549, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape585) model_decoder_layers_19_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] model_decoder_layers_19_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[960] gv550: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc341: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv550, R.dtype("float16")) _339: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_out_proj_weight2, reshape586, model_decoder_layers_19_encoder_attn_out_proj_bias2, alloc341) R.vm.kill_object(reshape586) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias2) gv551: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc342: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv551, R.dtype("float16")) cls.add5(alloc337, alloc341, alloc342) R.vm.kill_object(alloc337) R.vm.kill_object(alloc341) model_decoder_layers_19_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[967] model_decoder_layers_19_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[968] gv552: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc343: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv552, R.dtype("float16")) cls.layer_norm2(alloc342, model_decoder_layers_19_final_layer_norm_weight2, model_decoder_layers_19_final_layer_norm_bias2, alloc343) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias2) 
model_decoder_layers_19_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] model_decoder_layers_19_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[964] gv553: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc344: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv553, R.dtype("float16")) _342: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_19_fc1_weight2, alloc343, model_decoder_layers_19_fc1_bias2, alloc344) R.vm.kill_object(alloc343) R.vm.kill_object(model_decoder_layers_19_fc1_weight2) R.vm.kill_object(model_decoder_layers_19_fc1_bias2) model_decoder_layers_19_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] model_decoder_layers_19_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[966] gv554: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc345: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv554, R.dtype("float16")) _343: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_19_fc2_weight2, alloc344, model_decoder_layers_19_fc2_bias2, alloc345) R.vm.kill_object(alloc344) R.vm.kill_object(model_decoder_layers_19_fc2_weight2) R.vm.kill_object(model_decoder_layers_19_fc2_bias2) gv555: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc346: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage7, R.prim_value(0), gv555, R.dtype("float16")) cls.add5(alloc342, alloc345, alloc346) R.vm.kill_object(alloc342) R.vm.kill_object(alloc345) model_decoder_layers_20_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[976] model_decoder_layers_20_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[977] gv556: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc347: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv556, R.dtype("float16")) cls.layer_norm2(alloc346, model_decoder_layers_20_self_attn_layer_norm_weight2, model_decoder_layers_20_self_attn_layer_norm_bias2, alloc347) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias2) model_decoder_layers_20_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] model_decoder_layers_20_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[973] gv557: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc348: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv557, R.dtype("float16")) _346: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_q_proj_weight2, alloc347, model_decoder_layers_20_self_attn_q_proj_bias2, alloc348) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias2) gv558: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape587: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc348, gv558, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc348) model_decoder_layers_20_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] gv559: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc349: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv559, R.dtype("float16")) _347: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_20_self_attn_k_proj_weight2, alloc347, alloc349) R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight2) gv560: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape588: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc349, gv560, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc349) model_decoder_layers_20_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] model_decoder_layers_20_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[971] gv561: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc350: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv561, R.dtype("float16")) _348: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_v_proj_weight2, alloc347, model_decoder_layers_20_self_attn_v_proj_bias2, alloc350) R.vm.kill_object(alloc347) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias2) gv562: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape589: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc350, gv562, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc350) gv563: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc351: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv563, R.dtype("float16")) cls.concatenate1(reshape587, reshape588, reshape589, alloc351) R.vm.kill_object(reshape587) R.vm.kill_object(reshape588) R.vm.kill_object(reshape589) gv564: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape590: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc351, gv564, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc351) gv565: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc352: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv565, R.dtype("float16")) _350: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape590, alloc352) R.vm.kill_object(reshape590) gv566: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape591: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc352, gv566, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc352) gv567: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape592: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape591, gv567, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape591) model_decoder_layers_20_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] model_decoder_layers_20_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[975] gv568: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc353: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv568, R.dtype("float16")) _351: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_out_proj_weight2, reshape592, model_decoder_layers_20_self_attn_out_proj_bias2, alloc353) R.vm.kill_object(reshape592) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias2) gv569: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc354: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv569, R.dtype("float16")) cls.add5(alloc346, alloc353, alloc354) R.vm.kill_object(alloc346) R.vm.kill_object(alloc353) model_decoder_layers_20_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[985] model_decoder_layers_20_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[986] gv570: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc355: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv570, R.dtype("float16")) cls.layer_norm2(alloc354, model_decoder_layers_20_encoder_attn_layer_norm_weight2, model_decoder_layers_20_encoder_attn_layer_norm_bias2, alloc355) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias2) model_decoder_layers_20_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] model_decoder_layers_20_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[982] gv571: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc356: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv571, R.dtype("float16")) _354: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_q_proj_weight2, alloc355, model_decoder_layers_20_encoder_attn_q_proj_bias2, alloc356) R.vm.kill_object(alloc355) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias2) gv572: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape593: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc356, gv572, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc356) gv573: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape594: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape593, gv573, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape593) gv574: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc357: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv574, R.dtype("float16")) _355: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), 
R.prim_value(T.float32(1)), reshape594, alloc357) R.vm.kill_object(reshape594) gv575: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape595: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc357, gv575, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc357) gv576: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape596: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape595, gv576, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape595) model_decoder_layers_20_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] model_decoder_layers_20_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[984] gv577: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc358: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv577, R.dtype("float16")) _356: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_out_proj_weight2, reshape596, model_decoder_layers_20_encoder_attn_out_proj_bias2, alloc358) R.vm.kill_object(reshape596) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias2) gv578: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc359: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv578, R.dtype("float16")) cls.add5(alloc354, alloc358, alloc359) R.vm.kill_object(alloc354) R.vm.kill_object(alloc358) model_decoder_layers_20_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[991] model_decoder_layers_20_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[992] gv579: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc360: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv579, R.dtype("float16")) cls.layer_norm2(alloc359, model_decoder_layers_20_final_layer_norm_weight2, model_decoder_layers_20_final_layer_norm_bias2, alloc360) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias2) model_decoder_layers_20_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[987] model_decoder_layers_20_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[988] gv580: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc361: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv580, R.dtype("float16")) _359: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_20_fc1_weight2, alloc360, model_decoder_layers_20_fc1_bias2, alloc361) R.vm.kill_object(alloc360) 
R.vm.kill_object(model_decoder_layers_20_fc1_weight2) R.vm.kill_object(model_decoder_layers_20_fc1_bias2) model_decoder_layers_20_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] model_decoder_layers_20_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[990] gv581: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc362: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv581, R.dtype("float16")) _360: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_20_fc2_weight2, alloc361, model_decoder_layers_20_fc2_bias2, alloc362) R.vm.kill_object(alloc361) R.vm.kill_object(model_decoder_layers_20_fc2_weight2) R.vm.kill_object(model_decoder_layers_20_fc2_bias2) gv582: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc363: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv582, R.dtype("float16")) cls.add5(alloc359, alloc362, alloc363) R.vm.kill_object(alloc359) R.vm.kill_object(alloc362) model_decoder_layers_21_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1000] model_decoder_layers_21_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1001] gv583: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc364: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv583, R.dtype("float16")) cls.layer_norm2(alloc363, 
model_decoder_layers_21_self_attn_layer_norm_weight2, model_decoder_layers_21_self_attn_layer_norm_bias2, alloc364) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias2) model_decoder_layers_21_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] model_decoder_layers_21_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[997] gv584: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc365: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv584, R.dtype("float16")) _363: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_q_proj_weight2, alloc364, model_decoder_layers_21_self_attn_q_proj_bias2, alloc365) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias2) gv585: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape597: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc365, gv585, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc365) model_decoder_layers_21_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] gv586: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc366: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv586, R.dtype("float16")) _364: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_21_self_attn_k_proj_weight2, alloc364, alloc366) R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight2) gv587: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape598: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc366, gv587, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc366) model_decoder_layers_21_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] model_decoder_layers_21_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[995] gv588: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc367: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv588, R.dtype("float16")) _365: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_v_proj_weight2, alloc364, model_decoder_layers_21_self_attn_v_proj_bias2, alloc367) R.vm.kill_object(alloc364) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias2) gv589: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape599: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc367, gv589, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc367) gv590: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc368: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv590, R.dtype("float16")) cls.concatenate1(reshape597, reshape598, reshape599, alloc368) R.vm.kill_object(reshape597) R.vm.kill_object(reshape598) R.vm.kill_object(reshape599) gv591: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape600: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc368, gv591, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc368) gv592: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc369: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv592, R.dtype("float16")) _367: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape600, alloc369) R.vm.kill_object(reshape600) gv593: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape601: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc369, gv593, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc369) gv594: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape602: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape601, gv594, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape601) model_decoder_layers_21_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] model_decoder_layers_21_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[999] gv595: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc370: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv595, R.dtype("float16")) _368: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_out_proj_weight2, reshape602, model_decoder_layers_21_self_attn_out_proj_bias2, alloc370) R.vm.kill_object(reshape602) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias2) gv596: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc371: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv596, R.dtype("float16")) cls.add5(alloc363, alloc370, alloc371) R.vm.kill_object(alloc363) R.vm.kill_object(alloc370) 
model_decoder_layers_21_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1009] model_decoder_layers_21_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1010] gv597: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc372: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv597, R.dtype("float16")) cls.layer_norm2(alloc371, model_decoder_layers_21_encoder_attn_layer_norm_weight2, model_decoder_layers_21_encoder_attn_layer_norm_bias2, alloc372) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias2) model_decoder_layers_21_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] model_decoder_layers_21_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1006] gv598: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc373: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv598, R.dtype("float16")) _371: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_q_proj_weight2, alloc372, model_decoder_layers_21_encoder_attn_q_proj_bias2, alloc373) R.vm.kill_object(alloc372) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias2) gv599: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape603: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc373, gv599, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc373) gv600: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape604: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape603, gv600, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape603) gv601: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc374: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv601, R.dtype("float16")) _372: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape604, alloc374) R.vm.kill_object(reshape604) gv602: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape605: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc374, gv602, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc374) gv603: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape606: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape605, gv603, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape605) model_decoder_layers_21_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] model_decoder_layers_21_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1008] gv604: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc375: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv604, R.dtype("float16")) _373: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_out_proj_weight2, reshape606, model_decoder_layers_21_encoder_attn_out_proj_bias2, alloc375) R.vm.kill_object(reshape606) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias2) gv605: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc376: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv605, R.dtype("float16")) cls.add5(alloc371, alloc375, alloc376) R.vm.kill_object(alloc371) R.vm.kill_object(alloc375) model_decoder_layers_21_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1015] model_decoder_layers_21_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1016] gv606: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc377: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv606, R.dtype("float16")) cls.layer_norm2(alloc376, model_decoder_layers_21_final_layer_norm_weight2, model_decoder_layers_21_final_layer_norm_bias2, alloc377) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias2) model_decoder_layers_21_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] model_decoder_layers_21_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1012] gv607: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc378: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv607, R.dtype("float16")) _376: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_21_fc1_weight2, alloc377, model_decoder_layers_21_fc1_bias2, alloc378) R.vm.kill_object(alloc377) R.vm.kill_object(model_decoder_layers_21_fc1_weight2) R.vm.kill_object(model_decoder_layers_21_fc1_bias2) model_decoder_layers_21_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] model_decoder_layers_21_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1014] gv608: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc379: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv608, R.dtype("float16")) _377: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_21_fc2_weight2, alloc378, model_decoder_layers_21_fc2_bias2, 
alloc379) R.vm.kill_object(alloc378) R.vm.kill_object(model_decoder_layers_21_fc2_weight2) R.vm.kill_object(model_decoder_layers_21_fc2_bias2) gv609: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc380: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv609, R.dtype("float16")) cls.add5(alloc376, alloc379, alloc380) R.vm.kill_object(alloc376) R.vm.kill_object(alloc379) model_decoder_layers_22_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1024] model_decoder_layers_22_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1025] gv610: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc381: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv610, R.dtype("float16")) cls.layer_norm2(alloc380, model_decoder_layers_22_self_attn_layer_norm_weight2, model_decoder_layers_22_self_attn_layer_norm_bias2, alloc381) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias2) model_decoder_layers_22_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] model_decoder_layers_22_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1021] gv611: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc382: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv611, R.dtype("float16")) 
_380: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_q_proj_weight2, alloc381, model_decoder_layers_22_self_attn_q_proj_bias2, alloc382) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias2) gv612: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape607: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc382, gv612, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc382) model_decoder_layers_22_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] gv613: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc383: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv613, R.dtype("float16")) _381: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_22_self_attn_k_proj_weight2, alloc381, alloc383) R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight2) gv614: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape608: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc383, gv614, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc383) model_decoder_layers_22_self_attn_v_proj_weight2: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] model_decoder_layers_22_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1019] gv615: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc384: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv615, R.dtype("float16")) _382: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_v_proj_weight2, alloc381, model_decoder_layers_22_self_attn_v_proj_bias2, alloc384) R.vm.kill_object(alloc381) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias2) gv616: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape609: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc384, gv616, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc384) gv617: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc385: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv617, R.dtype("float16")) cls.concatenate1(reshape607, reshape608, reshape609, alloc385) R.vm.kill_object(reshape607) R.vm.kill_object(reshape608) R.vm.kill_object(reshape609) gv618: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape610: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc385, gv618, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc385) gv619: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc386: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv619, R.dtype("float16")) _384: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape610, alloc386) R.vm.kill_object(reshape610) gv620: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape611: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc386, gv620, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc386) gv621: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape612: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape611, gv621, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape611) model_decoder_layers_22_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] model_decoder_layers_22_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1023] gv622: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc387: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv622, R.dtype("float16")) _385: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_out_proj_weight2, reshape612, model_decoder_layers_22_self_attn_out_proj_bias2, alloc387) R.vm.kill_object(reshape612) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias2) gv623: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc388: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv623, R.dtype("float16")) cls.add5(alloc380, alloc387, alloc388) R.vm.kill_object(alloc380) R.vm.kill_object(alloc387) model_decoder_layers_22_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1033] model_decoder_layers_22_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1034] gv624: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc389: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv624, R.dtype("float16")) cls.layer_norm2(alloc388, model_decoder_layers_22_encoder_attn_layer_norm_weight2, model_decoder_layers_22_encoder_attn_layer_norm_bias2, alloc389) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias2) model_decoder_layers_22_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] model_decoder_layers_22_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1030] gv625: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc390: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv625, R.dtype("float16")) _388: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_q_proj_weight2, alloc389, model_decoder_layers_22_encoder_attn_q_proj_bias2, alloc390) R.vm.kill_object(alloc389) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias2) gv626: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape613: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc390, gv626, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc390) gv627: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape614: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape613, gv627, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape613) gv628: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc391: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv628, R.dtype("float16")) _389: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape614, alloc391) R.vm.kill_object(reshape614) gv629: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape615: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc391, gv629, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc391) gv630: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape616: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape615, gv630, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape615) model_decoder_layers_22_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] model_decoder_layers_22_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1032] gv631: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc392: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv631, R.dtype("float16")) _390: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_out_proj_weight2, reshape616, model_decoder_layers_22_encoder_attn_out_proj_bias2, alloc392) R.vm.kill_object(reshape616) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias2) gv632: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc393: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv632, R.dtype("float16")) cls.add5(alloc388, alloc392, alloc393) R.vm.kill_object(alloc388) R.vm.kill_object(alloc392) model_decoder_layers_22_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1039] model_decoder_layers_22_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1040] gv633: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc394: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv633, R.dtype("float16")) cls.layer_norm2(alloc393, model_decoder_layers_22_final_layer_norm_weight2, model_decoder_layers_22_final_layer_norm_bias2, alloc394) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias2) model_decoder_layers_22_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] model_decoder_layers_22_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1036] gv634: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc395: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv634, R.dtype("float16")) _393: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_22_fc1_weight2, alloc394, model_decoder_layers_22_fc1_bias2, alloc395) R.vm.kill_object(alloc394) R.vm.kill_object(model_decoder_layers_22_fc1_weight2) R.vm.kill_object(model_decoder_layers_22_fc1_bias2) model_decoder_layers_22_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] model_decoder_layers_22_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1038] gv635: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc396: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv635, R.dtype("float16")) _394: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_22_fc2_weight2, alloc395, model_decoder_layers_22_fc2_bias2, alloc396) R.vm.kill_object(alloc395) R.vm.kill_object(model_decoder_layers_22_fc2_weight2) R.vm.kill_object(model_decoder_layers_22_fc2_bias2) gv636: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc397: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv636, R.dtype("float16")) cls.add5(alloc393, alloc396, alloc397) R.vm.kill_object(alloc393) R.vm.kill_object(alloc396) model_decoder_layers_23_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1048] model_decoder_layers_23_self_attn_layer_norm_bias2: R.Tensor((1280,), 
dtype="float16") = packed_params[1049] gv637: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc398: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv637, R.dtype("float16")) cls.layer_norm2(alloc397, model_decoder_layers_23_self_attn_layer_norm_weight2, model_decoder_layers_23_self_attn_layer_norm_bias2, alloc398) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias2) model_decoder_layers_23_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] model_decoder_layers_23_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1045] gv638: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc399: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv638, R.dtype("float16")) _397: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_q_proj_weight2, alloc398, model_decoder_layers_23_self_attn_q_proj_bias2, alloc399) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias2) gv639: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape617: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc399, gv639, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc399) model_decoder_layers_23_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] gv640: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc400: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv640, R.dtype("float16")) _398: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_23_self_attn_k_proj_weight2, alloc398, alloc400) R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight2) gv641: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape618: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc400, gv641, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc400) model_decoder_layers_23_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] model_decoder_layers_23_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1043] gv642: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc401: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv642, R.dtype("float16")) _399: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_v_proj_weight2, alloc398, model_decoder_layers_23_self_attn_v_proj_bias2, alloc401) R.vm.kill_object(alloc398) 
R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias2) gv643: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape619: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc401, gv643, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc401) gv644: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc402: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv644, R.dtype("float16")) cls.concatenate1(reshape617, reshape618, reshape619, alloc402) R.vm.kill_object(reshape617) R.vm.kill_object(reshape618) R.vm.kill_object(reshape619) gv645: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape620: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc402, gv645, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc402) gv646: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc403: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv646, R.dtype("float16")) _401: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", 
paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape620, alloc403) R.vm.kill_object(reshape620) gv647: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape621: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc403, gv647, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc403) gv648: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape622: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape621, gv648, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape621) model_decoder_layers_23_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] model_decoder_layers_23_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1047] gv649: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc404: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv649, R.dtype("float16")) _402: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_out_proj_weight2, reshape622, model_decoder_layers_23_self_attn_out_proj_bias2, alloc404) R.vm.kill_object(reshape622) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias2) gv650: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc405: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv650, R.dtype("float16")) cls.add5(alloc397, alloc404, alloc405) R.vm.kill_object(alloc397) R.vm.kill_object(alloc404) model_decoder_layers_23_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1057] model_decoder_layers_23_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1058] gv651: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc406: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv651, R.dtype("float16")) cls.layer_norm2(alloc405, model_decoder_layers_23_encoder_attn_layer_norm_weight2, model_decoder_layers_23_encoder_attn_layer_norm_bias2, alloc406) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias2) model_decoder_layers_23_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] model_decoder_layers_23_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1054] gv652: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc407: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv652, R.dtype("float16")) _405: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_q_proj_weight2, 
alloc406, model_decoder_layers_23_encoder_attn_q_proj_bias2, alloc407) R.vm.kill_object(alloc406) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias2) gv653: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape623: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc407, gv653, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc407) gv654: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape624: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape623, gv654, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape623) gv655: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc408: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv655, R.dtype("float16")) _406: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape624, alloc408) R.vm.kill_object(reshape624) gv656: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape625: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc408, gv656, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc408) gv657: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape626: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape625, gv657, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape625) model_decoder_layers_23_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] model_decoder_layers_23_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1056] gv658: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc409: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv658, R.dtype("float16")) _407: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_out_proj_weight2, reshape626, model_decoder_layers_23_encoder_attn_out_proj_bias2, alloc409) R.vm.kill_object(reshape626) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias2) gv659: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc410: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv659, R.dtype("float16")) cls.add5(alloc405, alloc409, alloc410) R.vm.kill_object(alloc405) R.vm.kill_object(alloc409) 
model_decoder_layers_23_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1063] model_decoder_layers_23_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1064] gv660: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc411: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv660, R.dtype("float16")) cls.layer_norm2(alloc410, model_decoder_layers_23_final_layer_norm_weight2, model_decoder_layers_23_final_layer_norm_bias2, alloc411) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias2) model_decoder_layers_23_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] model_decoder_layers_23_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1060] gv661: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc412: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv661, R.dtype("float16")) _410: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_23_fc1_weight2, alloc411, model_decoder_layers_23_fc1_bias2, alloc412) R.vm.kill_object(alloc411) R.vm.kill_object(model_decoder_layers_23_fc1_weight2) R.vm.kill_object(model_decoder_layers_23_fc1_bias2) model_decoder_layers_23_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] model_decoder_layers_23_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1062] gv662: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc413: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv662, R.dtype("float16")) _411: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_23_fc2_weight2, alloc412, model_decoder_layers_23_fc2_bias2, alloc413) R.vm.kill_object(alloc412) R.vm.kill_object(model_decoder_layers_23_fc2_weight2) R.vm.kill_object(model_decoder_layers_23_fc2_bias2) gv663: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc414: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv663, R.dtype("float16")) cls.add5(alloc410, alloc413, alloc414) R.vm.kill_object(alloc410) R.vm.kill_object(alloc413) model_decoder_layers_24_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1072] model_decoder_layers_24_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1073] gv664: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc415: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv664, R.dtype("float16")) cls.layer_norm2(alloc414, model_decoder_layers_24_self_attn_layer_norm_weight2, model_decoder_layers_24_self_attn_layer_norm_bias2, alloc415) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias2) model_decoder_layers_24_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] 
model_decoder_layers_24_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1069] gv665: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc416: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv665, R.dtype("float16")) _414: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_q_proj_weight2, alloc415, model_decoder_layers_24_self_attn_q_proj_bias2, alloc416) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias2) gv666: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape627: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc416, gv666, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc416) model_decoder_layers_24_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] gv667: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc417: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv667, R.dtype("float16")) _415: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_24_self_attn_k_proj_weight2, alloc415, alloc417) R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight2) gv668: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape628: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc417, gv668, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc417) model_decoder_layers_24_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] model_decoder_layers_24_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1067] gv669: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc418: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv669, R.dtype("float16")) _416: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_v_proj_weight2, alloc415, model_decoder_layers_24_self_attn_v_proj_bias2, alloc418) R.vm.kill_object(alloc415) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias2) gv670: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape629: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc418, gv670, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc418) gv671: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc419: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv671, R.dtype("float16")) cls.concatenate1(reshape627, reshape628, reshape629, alloc419) R.vm.kill_object(reshape627) R.vm.kill_object(reshape628) R.vm.kill_object(reshape629) gv672: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape630: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc419, gv672, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc419) gv673: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc420: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv673, R.dtype("float16")) _418: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape630, alloc420) R.vm.kill_object(reshape630) gv674: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape631: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc420, gv674, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc420) gv675: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) reshape632: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape631, gv675, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape631) model_decoder_layers_24_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] model_decoder_layers_24_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1071] gv676: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc421: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv676, R.dtype("float16")) _419: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_out_proj_weight2, reshape632, model_decoder_layers_24_self_attn_out_proj_bias2, alloc421) R.vm.kill_object(reshape632) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias2) gv677: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc422: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv677, R.dtype("float16")) cls.add5(alloc414, alloc421, alloc422) R.vm.kill_object(alloc414) R.vm.kill_object(alloc421) model_decoder_layers_24_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1081] model_decoder_layers_24_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1082] gv678: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc423: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv678, R.dtype("float16")) cls.layer_norm2(alloc422, model_decoder_layers_24_encoder_attn_layer_norm_weight2, model_decoder_layers_24_encoder_attn_layer_norm_bias2, alloc423) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias2) model_decoder_layers_24_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1077] model_decoder_layers_24_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1078] gv679: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc424: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv679, R.dtype("float16")) _422: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_q_proj_weight2, alloc423, model_decoder_layers_24_encoder_attn_q_proj_bias2, alloc424) R.vm.kill_object(alloc423) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias2) gv680: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape633: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc424, gv680, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc424) gv681: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape634: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape633, gv681, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape633) gv682: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc425: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv682, R.dtype("float16")) _423: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape634, alloc425) R.vm.kill_object(reshape634) gv683: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape635: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc425, gv683, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc425) gv684: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape636: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape635, gv684, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape635) model_decoder_layers_24_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] model_decoder_layers_24_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") 
= packed_params[1080] gv685: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc426: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv685, R.dtype("float16")) _424: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_out_proj_weight2, reshape636, model_decoder_layers_24_encoder_attn_out_proj_bias2, alloc426) R.vm.kill_object(reshape636) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias2) gv686: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc427: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv686, R.dtype("float16")) cls.add5(alloc422, alloc426, alloc427) R.vm.kill_object(alloc422) R.vm.kill_object(alloc426) model_decoder_layers_24_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1087] model_decoder_layers_24_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1088] gv687: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc428: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv687, R.dtype("float16")) cls.layer_norm2(alloc427, model_decoder_layers_24_final_layer_norm_weight2, model_decoder_layers_24_final_layer_norm_bias2, alloc428) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias2) model_decoder_layers_24_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] model_decoder_layers_24_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1084] gv688: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc429: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv688, R.dtype("float16")) _427: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_24_fc1_weight2, alloc428, model_decoder_layers_24_fc1_bias2, alloc429) R.vm.kill_object(alloc428) R.vm.kill_object(model_decoder_layers_24_fc1_weight2) R.vm.kill_object(model_decoder_layers_24_fc1_bias2) model_decoder_layers_24_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] model_decoder_layers_24_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1086] gv689: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc430: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv689, R.dtype("float16")) _428: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_24_fc2_weight2, alloc429, model_decoder_layers_24_fc2_bias2, alloc430) R.vm.kill_object(alloc429) R.vm.kill_object(model_decoder_layers_24_fc2_weight2) R.vm.kill_object(model_decoder_layers_24_fc2_bias2) gv690: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc431: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv690, R.dtype("float16")) cls.add5(alloc427, alloc430, alloc431) R.vm.kill_object(alloc427) R.vm.kill_object(alloc430) model_decoder_layers_25_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1096] model_decoder_layers_25_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1097] gv691: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc432: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv691, R.dtype("float16")) cls.layer_norm2(alloc431, model_decoder_layers_25_self_attn_layer_norm_weight2, model_decoder_layers_25_self_attn_layer_norm_bias2, alloc432) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias2) model_decoder_layers_25_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] model_decoder_layers_25_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1093] gv692: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc433: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv692, R.dtype("float16")) _431: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_q_proj_weight2, alloc432, model_decoder_layers_25_self_attn_q_proj_bias2, alloc433) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias2) 
gv693: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape637: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc433, gv693, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc433) model_decoder_layers_25_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] gv694: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc434: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv694, R.dtype("float16")) _432: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_25_self_attn_k_proj_weight2, alloc432, alloc434) R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight2) gv695: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape638: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc434, gv695, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc434) model_decoder_layers_25_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] model_decoder_layers_25_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1091] gv696: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc435: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv696, R.dtype("float16")) _433: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_v_proj_weight2, alloc432, model_decoder_layers_25_self_attn_v_proj_bias2, alloc435) R.vm.kill_object(alloc432) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias2) gv697: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape639: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc435, gv697, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc435) gv698: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc436: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv698, R.dtype("float16")) cls.concatenate1(reshape637, reshape638, reshape639, alloc436) R.vm.kill_object(reshape637) R.vm.kill_object(reshape638) R.vm.kill_object(reshape639) gv699: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape640: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc436, gv699, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc436) gv700: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc437: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv700, R.dtype("float16")) _435: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape640, alloc437) R.vm.kill_object(reshape640) gv701: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape641: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc437, gv701, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc437) gv702: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape642: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape641, gv702, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape641) model_decoder_layers_25_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] model_decoder_layers_25_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1095] gv703: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc438: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv703, R.dtype("float16")) _436: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_out_proj_weight2, reshape642, model_decoder_layers_25_self_attn_out_proj_bias2, alloc438) R.vm.kill_object(reshape642) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias2) gv704: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc439: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv704, R.dtype("float16")) cls.add5(alloc431, alloc438, alloc439) R.vm.kill_object(alloc431) R.vm.kill_object(alloc438) model_decoder_layers_25_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1105] model_decoder_layers_25_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1106] gv705: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc440: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv705, R.dtype("float16")) cls.layer_norm2(alloc439, model_decoder_layers_25_encoder_attn_layer_norm_weight2, model_decoder_layers_25_encoder_attn_layer_norm_bias2, alloc440) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias2) model_decoder_layers_25_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] model_decoder_layers_25_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1102] gv706: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc441: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv706, R.dtype("float16")) _439: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_q_proj_weight2, alloc440, model_decoder_layers_25_encoder_attn_q_proj_bias2, alloc441) R.vm.kill_object(alloc440) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias2) gv707: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape643: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc441, gv707, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc441) gv708: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape644: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape643, gv708, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape643) gv709: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc442: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv709, R.dtype("float16")) _440: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), 
R.prim_value(T.float32(1)), reshape644, alloc442) R.vm.kill_object(reshape644) gv710: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape645: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc442, gv710, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc442) gv711: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape646: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape645, gv711, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape645) model_decoder_layers_25_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] model_decoder_layers_25_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1104] gv712: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc443: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv712, R.dtype("float16")) _441: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_out_proj_weight2, reshape646, model_decoder_layers_25_encoder_attn_out_proj_bias2, alloc443) R.vm.kill_object(reshape646) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias2) gv713: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc444: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv713, R.dtype("float16")) cls.add5(alloc439, alloc443, alloc444) R.vm.kill_object(alloc439) R.vm.kill_object(alloc443) model_decoder_layers_25_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1111] model_decoder_layers_25_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1112] gv714: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc445: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv714, R.dtype("float16")) cls.layer_norm2(alloc444, model_decoder_layers_25_final_layer_norm_weight2, model_decoder_layers_25_final_layer_norm_bias2, alloc445) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias2) model_decoder_layers_25_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] model_decoder_layers_25_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1108] gv715: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc446: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv715, R.dtype("float16")) _444: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_25_fc1_weight2, alloc445, model_decoder_layers_25_fc1_bias2, alloc446) R.vm.kill_object(alloc445) 
R.vm.kill_object(model_decoder_layers_25_fc1_weight2) R.vm.kill_object(model_decoder_layers_25_fc1_bias2) model_decoder_layers_25_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] model_decoder_layers_25_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1110] gv716: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc447: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv716, R.dtype("float16")) _445: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_25_fc2_weight2, alloc446, model_decoder_layers_25_fc2_bias2, alloc447) R.vm.kill_object(alloc446) R.vm.kill_object(model_decoder_layers_25_fc2_weight2) R.vm.kill_object(model_decoder_layers_25_fc2_bias2) gv717: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc448: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv717, R.dtype("float16")) cls.add5(alloc444, alloc447, alloc448) R.vm.kill_object(alloc444) R.vm.kill_object(alloc447) model_decoder_layers_26_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1120] model_decoder_layers_26_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1121] gv718: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc449: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv718, R.dtype("float16")) cls.layer_norm2(alloc448, 
model_decoder_layers_26_self_attn_layer_norm_weight2, model_decoder_layers_26_self_attn_layer_norm_bias2, alloc449) R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias2) model_decoder_layers_26_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] model_decoder_layers_26_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1117] gv719: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc450: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv719, R.dtype("float16")) _448: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_q_proj_weight2, alloc449, model_decoder_layers_26_self_attn_q_proj_bias2, alloc450) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias2) gv720: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape647: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc450, gv720, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc450) model_decoder_layers_26_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] gv721: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc451: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv721, R.dtype("float16")) _449: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_26_self_attn_k_proj_weight2, alloc449, alloc451) R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight2) gv722: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape648: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc451, gv722, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc451) model_decoder_layers_26_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] model_decoder_layers_26_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1115] gv723: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc452: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv723, R.dtype("float16")) _450: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_v_proj_weight2, alloc449, model_decoder_layers_26_self_attn_v_proj_bias2, alloc452) R.vm.kill_object(alloc449) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias2) gv724: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape649: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc452, gv724, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc452) gv725: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc453: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv725, R.dtype("float16")) cls.concatenate1(reshape647, reshape648, reshape649, alloc453) R.vm.kill_object(reshape647) R.vm.kill_object(reshape648) R.vm.kill_object(reshape649) gv726: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape650: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc453, gv726, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc453) gv727: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc454: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv727, R.dtype("float16")) _452: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape650, alloc454) R.vm.kill_object(reshape650) gv728: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape651: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc454, gv728, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc454) gv729: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape652: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape651, gv729, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape651) model_decoder_layers_26_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] model_decoder_layers_26_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1119] gv730: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc455: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv730, R.dtype("float16")) _453: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_out_proj_weight2, reshape652, model_decoder_layers_26_self_attn_out_proj_bias2, alloc455) R.vm.kill_object(reshape652) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias2) gv731: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc456: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv731, R.dtype("float16")) cls.add5(alloc448, alloc455, alloc456) R.vm.kill_object(alloc448) R.vm.kill_object(alloc455) 
model_decoder_layers_26_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1129] model_decoder_layers_26_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1130] gv732: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc457: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv732, R.dtype("float16")) cls.layer_norm2(alloc456, model_decoder_layers_26_encoder_attn_layer_norm_weight2, model_decoder_layers_26_encoder_attn_layer_norm_bias2, alloc457) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias2) model_decoder_layers_26_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] model_decoder_layers_26_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1126] gv733: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc458: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv733, R.dtype("float16")) _456: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_q_proj_weight2, alloc457, model_decoder_layers_26_encoder_attn_q_proj_bias2, alloc458) R.vm.kill_object(alloc457) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias2) gv734: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape653: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc458, gv734, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc458) gv735: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape654: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape653, gv735, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape653) gv736: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc459: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv736, R.dtype("float16")) _457: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape654, alloc459) R.vm.kill_object(reshape654) gv737: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape655: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc459, gv737, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc459) gv738: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape656: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape655, gv738, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape655) model_decoder_layers_26_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] model_decoder_layers_26_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1128] gv739: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc460: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv739, R.dtype("float16")) _458: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_out_proj_weight2, reshape656, model_decoder_layers_26_encoder_attn_out_proj_bias2, alloc460) R.vm.kill_object(reshape656) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias2) gv740: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc461: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv740, R.dtype("float16")) cls.add5(alloc456, alloc460, alloc461) R.vm.kill_object(alloc456) R.vm.kill_object(alloc460) model_decoder_layers_26_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1135] model_decoder_layers_26_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1136] gv741: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc462: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv741, R.dtype("float16")) cls.layer_norm2(alloc461, model_decoder_layers_26_final_layer_norm_weight2, model_decoder_layers_26_final_layer_norm_bias2, alloc462) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias2) model_decoder_layers_26_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] model_decoder_layers_26_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1132] gv742: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc463: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv742, R.dtype("float16")) _461: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_26_fc1_weight2, alloc462, model_decoder_layers_26_fc1_bias2, alloc463) R.vm.kill_object(alloc462) R.vm.kill_object(model_decoder_layers_26_fc1_weight2) R.vm.kill_object(model_decoder_layers_26_fc1_bias2) model_decoder_layers_26_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] model_decoder_layers_26_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1134] gv743: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc464: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv743, R.dtype("float16")) _462: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_26_fc2_weight2, alloc463, model_decoder_layers_26_fc2_bias2, 
alloc464) R.vm.kill_object(alloc463) R.vm.kill_object(model_decoder_layers_26_fc2_weight2) R.vm.kill_object(model_decoder_layers_26_fc2_bias2) gv744: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc465: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv744, R.dtype("float16")) cls.add5(alloc461, alloc464, alloc465) R.vm.kill_object(alloc461) R.vm.kill_object(alloc464) model_decoder_layers_27_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1144] model_decoder_layers_27_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1145] gv745: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc466: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv745, R.dtype("float16")) cls.layer_norm2(alloc465, model_decoder_layers_27_self_attn_layer_norm_weight2, model_decoder_layers_27_self_attn_layer_norm_bias2, alloc466) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias2) model_decoder_layers_27_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] model_decoder_layers_27_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1141] gv746: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc467: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv746, R.dtype("float16")) 
_465: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_q_proj_weight2, alloc466, model_decoder_layers_27_self_attn_q_proj_bias2, alloc467) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias2) gv747: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape657: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc467, gv747, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc467) model_decoder_layers_27_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] gv748: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc468: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv748, R.dtype("float16")) _466: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_27_self_attn_k_proj_weight2, alloc466, alloc468) R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight2) gv749: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape658: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc468, gv749, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc468) model_decoder_layers_27_self_attn_v_proj_weight2: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] model_decoder_layers_27_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1139] gv750: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc469: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv750, R.dtype("float16")) _467: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_v_proj_weight2, alloc466, model_decoder_layers_27_self_attn_v_proj_bias2, alloc469) R.vm.kill_object(alloc466) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias2) gv751: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape659: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc469, gv751, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc469) gv752: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc470: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv752, R.dtype("float16")) cls.concatenate1(reshape657, reshape658, reshape659, alloc470) R.vm.kill_object(reshape657) R.vm.kill_object(reshape658) R.vm.kill_object(reshape659) gv753: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape660: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc470, gv753, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc470) gv754: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc471: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv754, R.dtype("float16")) _469: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape660, alloc471) R.vm.kill_object(reshape660) gv755: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape661: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc471, gv755, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc471) gv756: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape662: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape661, gv756, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape661) model_decoder_layers_27_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] model_decoder_layers_27_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1143] gv757: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc472: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv757, R.dtype("float16")) _470: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_out_proj_weight2, reshape662, model_decoder_layers_27_self_attn_out_proj_bias2, alloc472) R.vm.kill_object(reshape662) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias2) gv758: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc473: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv758, R.dtype("float16")) cls.add5(alloc465, alloc472, alloc473) R.vm.kill_object(alloc465) R.vm.kill_object(alloc472) model_decoder_layers_27_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1153] model_decoder_layers_27_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1154] gv759: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc474: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv759, R.dtype("float16")) cls.layer_norm2(alloc473, model_decoder_layers_27_encoder_attn_layer_norm_weight2, model_decoder_layers_27_encoder_attn_layer_norm_bias2, alloc474) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias2) model_decoder_layers_27_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] model_decoder_layers_27_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1150] gv760: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc475: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv760, R.dtype("float16")) _473: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_q_proj_weight2, alloc474, model_decoder_layers_27_encoder_attn_q_proj_bias2, alloc475) R.vm.kill_object(alloc474) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias2) gv761: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape663: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc475, gv761, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc475) gv762: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape664: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape663, gv762, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape663) gv763: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc476: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv763, R.dtype("float16")) _474: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape664, alloc476) R.vm.kill_object(reshape664) gv764: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape665: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc476, gv764, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc476) gv765: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape666: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape665, gv765, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape665) model_decoder_layers_27_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] model_decoder_layers_27_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1152] gv766: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc477: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv766, R.dtype("float16")) _475: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_out_proj_weight2, reshape666, model_decoder_layers_27_encoder_attn_out_proj_bias2, alloc477) R.vm.kill_object(reshape666) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias2) gv767: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc478: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv767, R.dtype("float16")) cls.add5(alloc473, alloc477, alloc478) R.vm.kill_object(alloc473) R.vm.kill_object(alloc477) model_decoder_layers_27_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1159] model_decoder_layers_27_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1160] gv768: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc479: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv768, R.dtype("float16")) cls.layer_norm2(alloc478, model_decoder_layers_27_final_layer_norm_weight2, model_decoder_layers_27_final_layer_norm_bias2, alloc479) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias2) model_decoder_layers_27_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1155] model_decoder_layers_27_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1156] gv769: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc480: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv769, R.dtype("float16")) _478: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_27_fc1_weight2, alloc479, model_decoder_layers_27_fc1_bias2, alloc480) R.vm.kill_object(alloc479) R.vm.kill_object(model_decoder_layers_27_fc1_weight2) R.vm.kill_object(model_decoder_layers_27_fc1_bias2) model_decoder_layers_27_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] model_decoder_layers_27_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1158] gv770: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc481: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv770, R.dtype("float16")) _479: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_27_fc2_weight2, alloc480, model_decoder_layers_27_fc2_bias2, alloc481) R.vm.kill_object(alloc480) R.vm.kill_object(model_decoder_layers_27_fc2_weight2) R.vm.kill_object(model_decoder_layers_27_fc2_bias2) gv771: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc482: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv771, R.dtype("float16")) cls.add5(alloc478, alloc481, alloc482) R.vm.kill_object(alloc478) R.vm.kill_object(alloc481) model_decoder_layers_28_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1168] model_decoder_layers_28_self_attn_layer_norm_bias2: R.Tensor((1280,), 
dtype="float16") = packed_params[1169] gv772: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc483: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv772, R.dtype("float16")) cls.layer_norm2(alloc482, model_decoder_layers_28_self_attn_layer_norm_weight2, model_decoder_layers_28_self_attn_layer_norm_bias2, alloc483) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias2) model_decoder_layers_28_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] model_decoder_layers_28_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1165] gv773: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc484: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv773, R.dtype("float16")) _482: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_q_proj_weight2, alloc483, model_decoder_layers_28_self_attn_q_proj_bias2, alloc484) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias2) gv774: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape667: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc484, gv774, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc484) model_decoder_layers_28_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] gv775: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc485: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv775, R.dtype("float16")) _483: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_28_self_attn_k_proj_weight2, alloc483, alloc485) R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight2) gv776: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape668: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc485, gv776, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc485) model_decoder_layers_28_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] model_decoder_layers_28_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1163] gv777: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc486: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv777, R.dtype("float16")) _484: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_v_proj_weight2, alloc483, model_decoder_layers_28_self_attn_v_proj_bias2, alloc486) R.vm.kill_object(alloc483) 
R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias2) gv778: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape669: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc486, gv778, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc486) gv779: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc487: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv779, R.dtype("float16")) cls.concatenate1(reshape667, reshape668, reshape669, alloc487) R.vm.kill_object(reshape667) R.vm.kill_object(reshape668) R.vm.kill_object(reshape669) gv780: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape670: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc487, gv780, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc487) gv781: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc488: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv781, R.dtype("float16")) _486: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", 
paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape670, alloc488) R.vm.kill_object(reshape670) gv782: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape671: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc488, gv782, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc488) gv783: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape672: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape671, gv783, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape671) model_decoder_layers_28_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] model_decoder_layers_28_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1167] gv784: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc489: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv784, R.dtype("float16")) _487: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_out_proj_weight2, reshape672, model_decoder_layers_28_self_attn_out_proj_bias2, alloc489) R.vm.kill_object(reshape672) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias2) gv785: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc490: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv785, R.dtype("float16")) cls.add5(alloc482, alloc489, alloc490) R.vm.kill_object(alloc482) R.vm.kill_object(alloc489) model_decoder_layers_28_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1177] model_decoder_layers_28_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1178] gv786: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc491: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv786, R.dtype("float16")) cls.layer_norm2(alloc490, model_decoder_layers_28_encoder_attn_layer_norm_weight2, model_decoder_layers_28_encoder_attn_layer_norm_bias2, alloc491) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias2) model_decoder_layers_28_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] model_decoder_layers_28_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1174] gv787: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc492: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv787, R.dtype("float16")) _490: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_q_proj_weight2, 
alloc491, model_decoder_layers_28_encoder_attn_q_proj_bias2, alloc492) R.vm.kill_object(alloc491) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias2) gv788: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape673: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc492, gv788, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc492) gv789: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape674: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape673, gv789, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape673) gv790: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc493: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv790, R.dtype("float16")) _491: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape674, alloc493) R.vm.kill_object(reshape674) gv791: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape675: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc493, gv791, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc493) gv792: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape676: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape675, gv792, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape675) model_decoder_layers_28_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] model_decoder_layers_28_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1176] gv793: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc494: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv793, R.dtype("float16")) _492: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_out_proj_weight2, reshape676, model_decoder_layers_28_encoder_attn_out_proj_bias2, alloc494) R.vm.kill_object(reshape676) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias2) gv794: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc495: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv794, R.dtype("float16")) cls.add5(alloc490, alloc494, alloc495) R.vm.kill_object(alloc490) R.vm.kill_object(alloc494) 
model_decoder_layers_28_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1183] model_decoder_layers_28_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1184] gv795: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc496: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv795, R.dtype("float16")) cls.layer_norm2(alloc495, model_decoder_layers_28_final_layer_norm_weight2, model_decoder_layers_28_final_layer_norm_bias2, alloc496) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias2) model_decoder_layers_28_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] model_decoder_layers_28_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1180] gv796: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc497: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv796, R.dtype("float16")) _495: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_28_fc1_weight2, alloc496, model_decoder_layers_28_fc1_bias2, alloc497) R.vm.kill_object(alloc496) R.vm.kill_object(model_decoder_layers_28_fc1_weight2) R.vm.kill_object(model_decoder_layers_28_fc1_bias2) model_decoder_layers_28_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] model_decoder_layers_28_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1182] gv797: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc498: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv797, R.dtype("float16")) _496: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_28_fc2_weight2, alloc497, model_decoder_layers_28_fc2_bias2, alloc498) R.vm.kill_object(alloc497) R.vm.kill_object(model_decoder_layers_28_fc2_weight2) R.vm.kill_object(model_decoder_layers_28_fc2_bias2) gv798: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc499: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv798, R.dtype("float16")) cls.add5(alloc495, alloc498, alloc499) R.vm.kill_object(alloc495) R.vm.kill_object(alloc498) model_decoder_layers_29_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1192] model_decoder_layers_29_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1193] gv799: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc500: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv799, R.dtype("float16")) cls.layer_norm2(alloc499, model_decoder_layers_29_self_attn_layer_norm_weight2, model_decoder_layers_29_self_attn_layer_norm_bias2, alloc500) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias2) model_decoder_layers_29_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] 
model_decoder_layers_29_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1189] gv800: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc501: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv800, R.dtype("float16")) _499: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_q_proj_weight2, alloc500, model_decoder_layers_29_self_attn_q_proj_bias2, alloc501) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias2) gv801: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape677: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc501, gv801, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc501) model_decoder_layers_29_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] gv802: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc502: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv802, R.dtype("float16")) _500: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_29_self_attn_k_proj_weight2, alloc500, alloc502) R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight2) gv803: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape678: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc502, gv803, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc502) model_decoder_layers_29_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] model_decoder_layers_29_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1187] gv804: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc503: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv804, R.dtype("float16")) _501: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_v_proj_weight2, alloc500, model_decoder_layers_29_self_attn_v_proj_bias2, alloc503) R.vm.kill_object(alloc500) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias2) gv805: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape679: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc503, gv805, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc503) gv806: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc504: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv806, R.dtype("float16")) cls.concatenate1(reshape677, reshape678, reshape679, alloc504) R.vm.kill_object(reshape677) R.vm.kill_object(reshape678) R.vm.kill_object(reshape679) gv807: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape680: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc504, gv807, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc504) gv808: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc505: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv808, R.dtype("float16")) _503: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape680, alloc505) R.vm.kill_object(reshape680) gv809: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape681: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc505, gv809, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc505) gv810: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) reshape682: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape681, gv810, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape681) model_decoder_layers_29_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] model_decoder_layers_29_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1191] gv811: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc506: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv811, R.dtype("float16")) _504: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_out_proj_weight2, reshape682, model_decoder_layers_29_self_attn_out_proj_bias2, alloc506) R.vm.kill_object(reshape682) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias2) gv812: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc507: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv812, R.dtype("float16")) cls.add5(alloc499, alloc506, alloc507) R.vm.kill_object(alloc499) R.vm.kill_object(alloc506) model_decoder_layers_29_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1201] model_decoder_layers_29_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1202] gv813: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc508: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv813, R.dtype("float16")) cls.layer_norm2(alloc507, model_decoder_layers_29_encoder_attn_layer_norm_weight2, model_decoder_layers_29_encoder_attn_layer_norm_bias2, alloc508) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias2) model_decoder_layers_29_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] model_decoder_layers_29_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1198] gv814: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc509: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv814, R.dtype("float16")) _507: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_q_proj_weight2, alloc508, model_decoder_layers_29_encoder_attn_q_proj_bias2, alloc509) R.vm.kill_object(alloc508) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias2) gv815: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape683: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc509, gv815, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc509) gv816: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape684: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape683, gv816, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape683) gv817: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc510: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv817, R.dtype("float16")) _508: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape684, alloc510) R.vm.kill_object(reshape684) gv818: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape685: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc510, gv818, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc510) gv819: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape686: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape685, gv819, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape685) model_decoder_layers_29_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] model_decoder_layers_29_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") 
= packed_params[1200] gv820: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc511: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv820, R.dtype("float16")) _509: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_out_proj_weight2, reshape686, model_decoder_layers_29_encoder_attn_out_proj_bias2, alloc511) R.vm.kill_object(reshape686) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias2) gv821: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc512: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv821, R.dtype("float16")) cls.add5(alloc507, alloc511, alloc512) R.vm.kill_object(alloc507) R.vm.kill_object(alloc511) model_decoder_layers_29_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1207] model_decoder_layers_29_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1208] gv822: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc513: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv822, R.dtype("float16")) cls.layer_norm2(alloc512, model_decoder_layers_29_final_layer_norm_weight2, model_decoder_layers_29_final_layer_norm_bias2, alloc513) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight2) 
R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias2) model_decoder_layers_29_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] model_decoder_layers_29_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1204] gv823: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc514: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv823, R.dtype("float16")) _512: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_29_fc1_weight2, alloc513, model_decoder_layers_29_fc1_bias2, alloc514) R.vm.kill_object(alloc513) R.vm.kill_object(model_decoder_layers_29_fc1_weight2) R.vm.kill_object(model_decoder_layers_29_fc1_bias2) model_decoder_layers_29_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] model_decoder_layers_29_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1206] gv824: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc515: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv824, R.dtype("float16")) _513: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_29_fc2_weight2, alloc514, model_decoder_layers_29_fc2_bias2, alloc515) R.vm.kill_object(alloc514) R.vm.kill_object(model_decoder_layers_29_fc2_weight2) R.vm.kill_object(model_decoder_layers_29_fc2_bias2) gv825: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc516: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv825, R.dtype("float16")) cls.add5(alloc512, alloc515, alloc516) R.vm.kill_object(alloc512) R.vm.kill_object(alloc515) model_decoder_layers_30_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1216] model_decoder_layers_30_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1217] gv826: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc517: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv826, R.dtype("float16")) cls.layer_norm2(alloc516, model_decoder_layers_30_self_attn_layer_norm_weight2, model_decoder_layers_30_self_attn_layer_norm_bias2, alloc517) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias2) model_decoder_layers_30_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] model_decoder_layers_30_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1213] gv827: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc518: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv827, R.dtype("float16")) _516: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_q_proj_weight2, alloc517, model_decoder_layers_30_self_attn_q_proj_bias2, alloc518) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias2) 
gv828: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape687: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc518, gv828, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc518) model_decoder_layers_30_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] gv829: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc519: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv829, R.dtype("float16")) _517: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_30_self_attn_k_proj_weight2, alloc517, alloc519) R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight2) gv830: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape688: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc519, gv830, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc519) model_decoder_layers_30_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] model_decoder_layers_30_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1211] gv831: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc520: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv831, R.dtype("float16")) _518: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_v_proj_weight2, alloc517, model_decoder_layers_30_self_attn_v_proj_bias2, alloc520) R.vm.kill_object(alloc517) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias2) gv832: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape689: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc520, gv832, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc520) gv833: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc521: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv833, R.dtype("float16")) cls.concatenate1(reshape687, reshape688, reshape689, alloc521) R.vm.kill_object(reshape687) R.vm.kill_object(reshape688) R.vm.kill_object(reshape689) gv834: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape690: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc521, gv834, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc521) gv835: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc522: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv835, R.dtype("float16")) _520: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape690, alloc522) R.vm.kill_object(reshape690) gv836: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape691: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc522, gv836, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc522) gv837: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape692: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape691, gv837, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape691) model_decoder_layers_30_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] model_decoder_layers_30_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1215] gv838: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc523: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv838, R.dtype("float16")) _521: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_out_proj_weight2, reshape692, model_decoder_layers_30_self_attn_out_proj_bias2, alloc523) R.vm.kill_object(reshape692) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias2) gv839: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc524: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv839, R.dtype("float16")) cls.add5(alloc516, alloc523, alloc524) R.vm.kill_object(alloc516) R.vm.kill_object(alloc523) model_decoder_layers_30_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1225] model_decoder_layers_30_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1226] gv840: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc525: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv840, R.dtype("float16")) cls.layer_norm2(alloc524, model_decoder_layers_30_encoder_attn_layer_norm_weight2, model_decoder_layers_30_encoder_attn_layer_norm_bias2, alloc525) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias2) model_decoder_layers_30_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] model_decoder_layers_30_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1222] gv841: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc526: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv841, R.dtype("float16")) _524: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_q_proj_weight2, alloc525, model_decoder_layers_30_encoder_attn_q_proj_bias2, alloc526) R.vm.kill_object(alloc525) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias2) gv842: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape693: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc526, gv842, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc526) gv843: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape694: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape693, gv843, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape693) gv844: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc527: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv844, R.dtype("float16")) _525: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), 
R.prim_value(T.float32(1)), reshape694, alloc527) R.vm.kill_object(reshape694) gv845: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape695: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc527, gv845, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc527) gv846: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape696: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape695, gv846, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape695) model_decoder_layers_30_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] model_decoder_layers_30_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1224] gv847: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc528: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv847, R.dtype("float16")) _526: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_out_proj_weight2, reshape696, model_decoder_layers_30_encoder_attn_out_proj_bias2, alloc528) R.vm.kill_object(reshape696) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias2) gv848: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc529: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv848, R.dtype("float16")) cls.add5(alloc524, alloc528, alloc529) R.vm.kill_object(alloc524) R.vm.kill_object(alloc528) model_decoder_layers_30_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1231] model_decoder_layers_30_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1232] gv849: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc530: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv849, R.dtype("float16")) cls.layer_norm2(alloc529, model_decoder_layers_30_final_layer_norm_weight2, model_decoder_layers_30_final_layer_norm_bias2, alloc530) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias2) model_decoder_layers_30_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1227] model_decoder_layers_30_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1228] gv850: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc531: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv850, R.dtype("float16")) _529: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_30_fc1_weight2, alloc530, model_decoder_layers_30_fc1_bias2, alloc531) R.vm.kill_object(alloc530) 
R.vm.kill_object(model_decoder_layers_30_fc1_weight2) R.vm.kill_object(model_decoder_layers_30_fc1_bias2) model_decoder_layers_30_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] model_decoder_layers_30_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1230] gv851: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc532: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv851, R.dtype("float16")) _530: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_30_fc2_weight2, alloc531, model_decoder_layers_30_fc2_bias2, alloc532) R.vm.kill_object(alloc531) R.vm.kill_object(model_decoder_layers_30_fc2_weight2) R.vm.kill_object(model_decoder_layers_30_fc2_bias2) gv852: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc533: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv852, R.dtype("float16")) cls.add5(alloc529, alloc532, alloc533) R.vm.kill_object(alloc529) R.vm.kill_object(alloc532) model_decoder_layers_31_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1240] model_decoder_layers_31_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1241] gv853: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc534: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv853, R.dtype("float16")) cls.layer_norm2(alloc533, 
model_decoder_layers_31_self_attn_layer_norm_weight2, model_decoder_layers_31_self_attn_layer_norm_bias2, alloc534) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias2) model_decoder_layers_31_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] model_decoder_layers_31_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1237] gv854: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc535: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv854, R.dtype("float16")) _533: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_q_proj_weight2, alloc534, model_decoder_layers_31_self_attn_q_proj_bias2, alloc535) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias2) gv855: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape697: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc535, gv855, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc535) model_decoder_layers_31_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] gv856: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc536: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv856, R.dtype("float16")) _534: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_31_self_attn_k_proj_weight2, alloc534, alloc536) R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight2) gv857: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape698: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc536, gv857, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc536) model_decoder_layers_31_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] model_decoder_layers_31_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1235] gv858: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc537: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv858, R.dtype("float16")) _535: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_v_proj_weight2, alloc534, model_decoder_layers_31_self_attn_v_proj_bias2, alloc537) R.vm.kill_object(alloc534) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight2) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias2) gv859: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape699: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc537, gv859, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc537) gv860: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc538: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv860, R.dtype("float16")) cls.concatenate1(reshape697, reshape698, reshape699, alloc538) R.vm.kill_object(reshape697) R.vm.kill_object(reshape698) R.vm.kill_object(reshape699) gv861: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape700: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc538, gv861, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc538) gv862: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc539: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv862, R.dtype("float16")) _537: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape700, alloc539) R.vm.kill_object(reshape700) gv863: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape701: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc539, gv863, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc539) gv864: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape702: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape701, gv864, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape701) model_decoder_layers_31_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1238] model_decoder_layers_31_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1239] gv865: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc540: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv865, R.dtype("float16")) _538: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_out_proj_weight2, reshape702, model_decoder_layers_31_self_attn_out_proj_bias2, alloc540) R.vm.kill_object(reshape702) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias2) gv866: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc541: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv866, R.dtype("float16")) cls.add5(alloc533, alloc540, alloc541) R.vm.kill_object(alloc533) R.vm.kill_object(alloc540) 
model_decoder_layers_31_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1249] model_decoder_layers_31_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1250] gv867: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc542: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv867, R.dtype("float16")) cls.layer_norm2(alloc541, model_decoder_layers_31_encoder_attn_layer_norm_weight2, model_decoder_layers_31_encoder_attn_layer_norm_bias2, alloc542) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias2) model_decoder_layers_31_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] model_decoder_layers_31_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1246] gv868: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc543: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv868, R.dtype("float16")) _541: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_q_proj_weight2, alloc542, model_decoder_layers_31_encoder_attn_q_proj_bias2, alloc543) R.vm.kill_object(alloc542) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight2) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias2) gv869: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape703: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc543, gv869, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc543) gv870: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape704: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape703, gv870, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape703) gv871: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc544: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv871, R.dtype("float16")) _542: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape704, alloc544) R.vm.kill_object(reshape704) gv872: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape705: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc544, gv872, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc544) gv873: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape706: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape705, gv873, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape705) model_decoder_layers_31_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] model_decoder_layers_31_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1248] gv874: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc545: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv874, R.dtype("float16")) _543: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_out_proj_weight2, reshape706, model_decoder_layers_31_encoder_attn_out_proj_bias2, alloc545) R.vm.kill_object(reshape706) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight2) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias2) gv875: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc546: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv875, R.dtype("float16")) R.vm.kill_object(storage6) cls.add5(alloc541, alloc545, alloc546) R.vm.kill_object(alloc541) R.vm.kill_object(alloc545) model_decoder_layers_31_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1255] model_decoder_layers_31_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1256] gv876: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc547: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv876, R.dtype("float16")) cls.layer_norm2(alloc546, model_decoder_layers_31_final_layer_norm_weight2, model_decoder_layers_31_final_layer_norm_bias2, alloc547) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight2) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias2) model_decoder_layers_31_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] model_decoder_layers_31_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1252] gv877: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc548: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv877, R.dtype("float16")) R.vm.kill_object(storage4) _546: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_31_fc1_weight2, alloc547, model_decoder_layers_31_fc1_bias2, alloc548) R.vm.kill_object(alloc547) R.vm.kill_object(model_decoder_layers_31_fc1_weight2) R.vm.kill_object(model_decoder_layers_31_fc1_bias2) model_decoder_layers_31_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] model_decoder_layers_31_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1254] gv878: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc549: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv878, R.dtype("float16")) R.vm.kill_object(storage5) _547: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", 
model_decoder_layers_31_fc2_weight2, alloc548, model_decoder_layers_31_fc2_bias2, alloc549) R.vm.kill_object(alloc548) R.vm.kill_object(model_decoder_layers_31_fc2_weight2) R.vm.kill_object(model_decoder_layers_31_fc2_bias2) gv879: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc550: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv879, R.dtype("float16")) R.vm.kill_object(storage7) cls.add5(alloc546, alloc549, alloc550) R.vm.kill_object(alloc546) R.vm.kill_object(alloc549) model_decoder_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1257] model_decoder_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1258] gv880: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc551: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv880, R.dtype("float16")) R.vm.kill_object(storage8) cls.layer_norm2(alloc550, model_decoder_layer_norm_weight2, model_decoder_layer_norm_bias2, alloc551) R.vm.kill_object(alloc550) R.vm.kill_object(model_decoder_layer_norm_weight2) R.vm.kill_object(model_decoder_layer_norm_bias2) storage9: R.Object = R.vm.alloc_storage(R.shape([20480]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv881: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc552: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage9, R.prim_value(0), gv881, R.dtype("float16")) R.vm.kill_object(storage9) cls.take2(alloc551, logit_positions, 
# NOTE(review): machine-generated TVM Relax VM dump (TVMScript printer output).
# The formatting in this file is mangled — several VM statements are fused onto
# each physical line — so only standalone comment lines are inserted here; the
# code lines themselves are untouched.
#
# The first line below finishes the preceding @R.function (its ErrorContext
# strings name it `batch_prefill`): it allocates a float32 output buffer,
# multiplies the gathered hidden states against
# `model_decoder_embed_tokens_weight2` via the fused cuBLAS kernel, checks the
# result against (1, batch_size, 51866) with `vm.builtin.match_shape`, and
# returns it as the logits tensor.
#
# It then opens `create_tir_paged_kv_cache(max_batch_size_, max_total_seq_len_,
# prefill_chunk_size_, page_size_, support_sliding_window_) -> R.Object`:
# every parameter is a 1-D R.Shape carrying a single symbolic int, and a
# 5-slot shape heap is allocated to receive those ints.
alloc552) R.vm.kill_object(alloc551) storage10: R.Object = R.vm.alloc_storage(R.shape([1659712]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv882: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(51866), sinfo_args=(R.Shape(ndim=3),)) alloc553: R.Tensor(dtype="float32", ndim=3) = R.vm.alloc_tensor(storage10, R.prim_value(0), gv882, R.dtype("float32")) R.vm.kill_object(storage10) _551: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul5_cublas", model_decoder_embed_tokens_weight2, alloc552, alloc553) R.vm.kill_object(model_decoder_embed_tokens_weight2) R.vm.kill_object(alloc552) R.call_packed("vm.builtin.match_shape", alloc553, shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(51866), R.str("ErrorContext(fn=batch_prefill, loc=return, annotation=R.Tensor((1, batch_size, 51866), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) return alloc553 @R.function def create_tir_paged_kv_cache(max_batch_size_: R.Shape(["max_batch_size"]), max_total_seq_len_: R.Shape(["max_total_seq_len"]), prefill_chunk_size_: R.Shape(["prefill_chunk_size"]), page_size_: R.Shape(["page_size"]), support_sliding_window_: R.Shape(["support_sliding_window"])) -> R.Object: max_batch_size = T.int64() max_total_seq_len = T.int64() prefill_chunk_size = T.int64() page_size = T.int64() support_sliding_window = T.int64() R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(5),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_shape_info", max_batch_size_, R.prim_value(1),
# Parameter validation: for each of the five shape parameters, a
# `vm.builtin.check_shape_info` call asserts it is a 1-D shape, and a
# `vm.builtin.match_shape` call copies its single dimension into slots 0..4 of
# `shape_heap` (allocated above with capacity 5).  Each call carries an
# ErrorContext string used verbatim in the runtime error message.
R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[0], param=max_batch_size_, annotation=R.Shape([max_batch_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_shape_info", max_total_seq_len_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[1], param=max_total_seq_len_, annotation=R.Shape([max_total_seq_len])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_shape_info", prefill_chunk_size_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[2], param=prefill_chunk_size_, annotation=R.Shape([prefill_chunk_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_shape_info", page_size_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[3], param=page_size_, annotation=R.Shape([page_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_shape_info", support_sliding_window_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[4], param=support_sliding_window_, annotation=R.Shape([support_sliding_window])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", max_batch_size_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[0], param=max_batch_size_, annotation=R.Shape([max_batch_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", max_total_seq_len_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[1], param=max_total_seq_len_, annotation=R.Shape([max_total_seq_len])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", prefill_chunk_size_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[2], param=prefill_chunk_size_, annotation=R.Shape([prefill_chunk_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", page_size_,
# After validation, `vm.builtin.make_shape` packs the five heap slots into one
# 5-D shape, and `vm.builtin.paged_attention_kv_cache_create_reduced` builds
# the cache object from that shape plus the TIR kernels compiled into this
# module (transpose-append, paged/ragged prefill and decode, sliding-window
# variants, merge-state, fused RoPE, page copy, debug get-kv, compact copy,
# tree attention).  The literal args 32 / 20 / 20 / 64 presumably encode
# num-layers / num-qo-heads / num-kv-heads / head-dim — consistent with the
# (…, 20, 64) reshapes and 1280 = 20*64 hidden size elsewhere in this dump —
# TODO confirm against the TVM PagedKVCache constructor signature.
# The tail of this line already opens the next @R.function, `decode`, whose
# `packed_params` tuple annotation continues beyond this span.
shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[3], param=page_size_, annotation=R.Shape([page_size])) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", support_sliding_window_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[4], param=support_sliding_window_, annotation=R.Shape([support_sliding_window])) "), sinfo_args=(R.Tuple,)) gv2559: R.Shape(ndim=5) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(5), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.prim_value(2), R.prim_value(1), R.prim_value(3), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=5),)) paged_kv_cache: R.Object = R.call_packed("vm.builtin.paged_attention_kv_cache_create_reduced", gv2559, R.prim_value(32), R.prim_value(20), R.prim_value(20), R.prim_value(64), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.const(0, "float16"), cls.tir_kv_cache_transpose_append, cls.batch_prefill_paged_kv, cls.batch_decode_paged_kv, cls.batch_prefill_paged_kv_sliding_window, cls.batch_decode_paged_kv_sliding_window, cls.batch_prefill_ragged_kv, cls.merge_state_inplace, cls.fused_rope, cls.copy_single_page, cls.tir_kv_cache_debug_get_kv, cls.compact_kv_copy, cls.batch_tree_attn, sinfo_args=(R.Object,)) return paged_kv_cache @R.function def decode(input_ids: R.Tensor((1, 1), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"),
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor((1, 1, 51866), dtype="float32"): R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(1),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) 
R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=decode, loc=param[0], param=input_ids, annotation=R.Tensor((1, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=decode, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.str("ErrorContext(fn=decode, loc=param[0], param=input_ids, annotation=R.Tensor((1, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) model_decoder_embed_tokens_weight5: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] reshape1353: R.Tensor((1,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, R.shape([1]), sinfo_args=(R.Tensor((1,), dtype="int32"),)) model_decoder_embed_tokens_weight5_1: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] storage19: R.Object = R.vm.alloc_storage(R.shape([10240]), R.prim_value(0), R.dtype("uint8"), R.str("global")) alloc1167: R.Tensor((1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1280]), R.dtype("float16")) cls.take3(model_decoder_embed_tokens_weight5_1, reshape1353, alloc1167) 
R.vm.kill_object(reshape1353) R.vm.kill_object(model_decoder_embed_tokens_weight5_1) lv264: R.Tensor((1,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((1,), dtype="int32"),)) model_decoder_embed_positions_weight5: R.Tensor((448, 1280), dtype="float16") = packed_params[488] storage20: R.Object = R.vm.alloc_storage(R.shape([7680]), R.prim_value(0), R.dtype("uint8"), R.str("global")) alloc1168: R.Tensor((1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1280]), R.dtype("float16")) cls.take4(model_decoder_embed_positions_weight5, lv264, alloc1168) R.vm.kill_object(lv264) R.vm.kill_object(model_decoder_embed_positions_weight5) storage21: R.Object = R.vm.alloc_storage(R.shape([2560]), R.prim_value(0), R.dtype("uint8"), R.str("global")) alloc1169: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_reshape20_reshape20_add6(alloc1167, alloc1168, alloc1169) R.vm.kill_object(alloc1167) R.vm.kill_object(alloc1168) model_decoder_layers_0_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[496] model_decoder_layers_0_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[497] alloc1170: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1169, model_decoder_layers_0_self_attn_layer_norm_weight5, model_decoder_layers_0_self_attn_layer_norm_bias5, alloc1170) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias5) model_decoder_layers_0_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] model_decoder_layers_0_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[493] alloc1171: R.Tensor((1, 1, 
1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1170, model_decoder_layers_0_self_attn_q_proj_weight5, model_decoder_layers_0_self_attn_q_proj_bias5, alloc1171) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias5) model_decoder_layers_0_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] storage22: R.Object = R.vm.alloc_storage(R.shape([7680]), R.prim_value(0), R.dtype("uint8"), R.str("global")) alloc1172: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1170, model_decoder_layers_0_self_attn_k_proj_weight5, alloc1172) R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight5) model_decoder_layers_0_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] model_decoder_layers_0_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[491] storage23: R.Object = R.vm.alloc_storage(R.shape([7680]), R.prim_value(0), R.dtype("uint8"), R.str("global")) alloc1173: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1170, model_decoder_layers_0_self_attn_v_proj_weight5, model_decoder_layers_0_self_attn_v_proj_bias5, alloc1173) R.vm.kill_object(alloc1170) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias5) alloc1174: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1171, alloc1172, alloc1173, alloc1174) R.vm.kill_object(alloc1171) R.vm.kill_object(alloc1172) R.vm.kill_object(alloc1173) 
alloc1175: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1173: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), alloc1174, alloc1175) R.vm.kill_object(alloc1174) lv44: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1175, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1175) model_decoder_layers_0_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] model_decoder_layers_0_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[495] alloc1176: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv44, model_decoder_layers_0_self_attn_out_proj_weight5, model_decoder_layers_0_self_attn_out_proj_bias5, alloc1169, alloc1176) R.vm.kill_object(alloc1169) R.vm.kill_object(lv44) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias5) model_decoder_layers_0_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[505] model_decoder_layers_0_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[506] alloc1177: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1176, model_decoder_layers_0_encoder_attn_layer_norm_weight5, model_decoder_layers_0_encoder_attn_layer_norm_bias5, alloc1177) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias5) model_decoder_layers_0_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[501] model_decoder_layers_0_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[502] alloc1178: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1177, model_decoder_layers_0_encoder_attn_q_proj_weight5, model_decoder_layers_0_encoder_attn_q_proj_bias5, alloc1178) R.vm.kill_object(alloc1177) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias5) lv47: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1178, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1178) alloc1179: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1177: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), lv47, alloc1179) R.vm.kill_object(lv47) lv48: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1179, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1179) model_decoder_layers_0_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[503] model_decoder_layers_0_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[504] alloc1180: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv48, model_decoder_layers_0_encoder_attn_out_proj_weight5, model_decoder_layers_0_encoder_attn_out_proj_bias5, alloc1176, alloc1180) R.vm.kill_object(alloc1176) R.vm.kill_object(lv48) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight5) 
R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias5) model_decoder_layers_0_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[511] model_decoder_layers_0_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[512] alloc1181: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1180, model_decoder_layers_0_final_layer_norm_weight5, model_decoder_layers_0_final_layer_norm_bias5, alloc1181) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias5) model_decoder_layers_0_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] model_decoder_layers_0_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[508] alloc1182: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1181, model_decoder_layers_0_fc1_weight5, model_decoder_layers_0_fc1_bias5, alloc1182) R.vm.kill_object(alloc1181) R.vm.kill_object(model_decoder_layers_0_fc1_weight5) R.vm.kill_object(model_decoder_layers_0_fc1_bias5) model_decoder_layers_0_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] model_decoder_layers_0_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[510] alloc1183: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1182, model_decoder_layers_0_fc2_weight5, model_decoder_layers_0_fc2_bias5, alloc1180, alloc1183) R.vm.kill_object(alloc1180) R.vm.kill_object(alloc1182) R.vm.kill_object(model_decoder_layers_0_fc2_weight5) R.vm.kill_object(model_decoder_layers_0_fc2_bias5) model_decoder_layers_1_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[520] 
model_decoder_layers_1_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[521] alloc1184: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1183, model_decoder_layers_1_self_attn_layer_norm_weight5, model_decoder_layers_1_self_attn_layer_norm_bias5, alloc1184) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias5) model_decoder_layers_1_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] model_decoder_layers_1_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[517] alloc1185: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1184, model_decoder_layers_1_self_attn_q_proj_weight5, model_decoder_layers_1_self_attn_q_proj_bias5, alloc1185) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias5) model_decoder_layers_1_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] alloc1186: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1184, model_decoder_layers_1_self_attn_k_proj_weight5, alloc1186) R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight5) model_decoder_layers_1_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[514] model_decoder_layers_1_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[515] alloc1187: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1184, model_decoder_layers_1_self_attn_v_proj_weight5, 
model_decoder_layers_1_self_attn_v_proj_bias5, alloc1187) R.vm.kill_object(alloc1184) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias5) alloc1188: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1185, alloc1186, alloc1187, alloc1188) R.vm.kill_object(alloc1185) R.vm.kill_object(alloc1186) R.vm.kill_object(alloc1187) alloc1189: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1187: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), alloc1188, alloc1189) R.vm.kill_object(alloc1188) lv55: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1189, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1189) model_decoder_layers_1_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] model_decoder_layers_1_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[519] alloc1190: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv55, model_decoder_layers_1_self_attn_out_proj_weight5, model_decoder_layers_1_self_attn_out_proj_bias5, alloc1183, alloc1190) R.vm.kill_object(alloc1183) R.vm.kill_object(lv55) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias5) model_decoder_layers_1_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[529] model_decoder_layers_1_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = 
packed_params[530] alloc1191: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1190, model_decoder_layers_1_encoder_attn_layer_norm_weight5, model_decoder_layers_1_encoder_attn_layer_norm_bias5, alloc1191) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias5) model_decoder_layers_1_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[525] model_decoder_layers_1_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[526] alloc1192: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1191, model_decoder_layers_1_encoder_attn_q_proj_weight5, model_decoder_layers_1_encoder_attn_q_proj_bias5, alloc1192) R.vm.kill_object(alloc1191) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias5) lv58: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1192, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1192) alloc1193: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1191: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), lv58, alloc1193) R.vm.kill_object(lv58) lv59: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1193, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1193) model_decoder_layers_1_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] 
model_decoder_layers_1_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[528] alloc1194: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv59, model_decoder_layers_1_encoder_attn_out_proj_weight5, model_decoder_layers_1_encoder_attn_out_proj_bias5, alloc1190, alloc1194) R.vm.kill_object(alloc1190) R.vm.kill_object(lv59) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias5) model_decoder_layers_1_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[535] model_decoder_layers_1_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[536] alloc1195: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1194, model_decoder_layers_1_final_layer_norm_weight5, model_decoder_layers_1_final_layer_norm_bias5, alloc1195) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias5) model_decoder_layers_1_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] model_decoder_layers_1_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[532] alloc1196: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1195, model_decoder_layers_1_fc1_weight5, model_decoder_layers_1_fc1_bias5, alloc1196) R.vm.kill_object(alloc1195) R.vm.kill_object(model_decoder_layers_1_fc1_weight5) R.vm.kill_object(model_decoder_layers_1_fc1_bias5) model_decoder_layers_1_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] model_decoder_layers_1_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[534] 
alloc1197: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1196, model_decoder_layers_1_fc2_weight5, model_decoder_layers_1_fc2_bias5, alloc1194, alloc1197) R.vm.kill_object(alloc1194) R.vm.kill_object(alloc1196) R.vm.kill_object(model_decoder_layers_1_fc2_weight5) R.vm.kill_object(model_decoder_layers_1_fc2_bias5) model_decoder_layers_2_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[544] model_decoder_layers_2_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[545] alloc1198: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1197, model_decoder_layers_2_self_attn_layer_norm_weight5, model_decoder_layers_2_self_attn_layer_norm_bias5, alloc1198) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias5) model_decoder_layers_2_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] model_decoder_layers_2_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[541] alloc1199: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1198, model_decoder_layers_2_self_attn_q_proj_weight5, model_decoder_layers_2_self_attn_q_proj_bias5, alloc1199) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias5) model_decoder_layers_2_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] alloc1200: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1198, 
model_decoder_layers_2_self_attn_k_proj_weight5, alloc1200) R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight5) model_decoder_layers_2_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[538] model_decoder_layers_2_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[539] alloc1201: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1198, model_decoder_layers_2_self_attn_v_proj_weight5, model_decoder_layers_2_self_attn_v_proj_bias5, alloc1201) R.vm.kill_object(alloc1198) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias5) alloc1202: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1199, alloc1200, alloc1201, alloc1202) R.vm.kill_object(alloc1199) R.vm.kill_object(alloc1200) R.vm.kill_object(alloc1201) alloc1203: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1201: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), alloc1202, alloc1203) R.vm.kill_object(alloc1202) lv66: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1203, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1203) model_decoder_layers_2_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] model_decoder_layers_2_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[543] alloc1204: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv66, model_decoder_layers_2_self_attn_out_proj_weight5, model_decoder_layers_2_self_attn_out_proj_bias5, alloc1197, alloc1204) R.vm.kill_object(alloc1197) R.vm.kill_object(lv66) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias5) model_decoder_layers_2_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[553] model_decoder_layers_2_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[554] alloc1205: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1204, model_decoder_layers_2_encoder_attn_layer_norm_weight5, model_decoder_layers_2_encoder_attn_layer_norm_bias5, alloc1205) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias5) model_decoder_layers_2_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] model_decoder_layers_2_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[550] alloc1206: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1205, model_decoder_layers_2_encoder_attn_q_proj_weight5, model_decoder_layers_2_encoder_attn_q_proj_bias5, alloc1206) R.vm.kill_object(alloc1205) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias5) lv69: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1206, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1206) alloc1207: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), 
R.shape([1, 20, 64]), R.dtype("float16")) _1205: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), lv69, alloc1207) R.vm.kill_object(lv69) lv70: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1207, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1207) model_decoder_layers_2_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] model_decoder_layers_2_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[552] alloc1208: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv70, model_decoder_layers_2_encoder_attn_out_proj_weight5, model_decoder_layers_2_encoder_attn_out_proj_bias5, alloc1204, alloc1208) R.vm.kill_object(alloc1204) R.vm.kill_object(lv70) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias5) model_decoder_layers_2_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[559] model_decoder_layers_2_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[560] alloc1209: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1208, model_decoder_layers_2_final_layer_norm_weight5, model_decoder_layers_2_final_layer_norm_bias5, alloc1209) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias5) model_decoder_layers_2_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] model_decoder_layers_2_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[556] alloc1210: R.Tensor((1, 1, 5120), dtype="float16") = 
R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1209, model_decoder_layers_2_fc1_weight5, model_decoder_layers_2_fc1_bias5, alloc1210) R.vm.kill_object(alloc1209) R.vm.kill_object(model_decoder_layers_2_fc1_weight5) R.vm.kill_object(model_decoder_layers_2_fc1_bias5) model_decoder_layers_2_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] model_decoder_layers_2_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[558] alloc1211: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1210, model_decoder_layers_2_fc2_weight5, model_decoder_layers_2_fc2_bias5, alloc1208, alloc1211) R.vm.kill_object(alloc1208) R.vm.kill_object(alloc1210) R.vm.kill_object(model_decoder_layers_2_fc2_weight5) R.vm.kill_object(model_decoder_layers_2_fc2_bias5) model_decoder_layers_3_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[568] model_decoder_layers_3_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[569] alloc1212: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1211, model_decoder_layers_3_self_attn_layer_norm_weight5, model_decoder_layers_3_self_attn_layer_norm_bias5, alloc1212) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias5) model_decoder_layers_3_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] model_decoder_layers_3_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[565] alloc1213: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1212, 
model_decoder_layers_3_self_attn_q_proj_weight5, model_decoder_layers_3_self_attn_q_proj_bias5, alloc1213) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias5) model_decoder_layers_3_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] alloc1214: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1212, model_decoder_layers_3_self_attn_k_proj_weight5, alloc1214) R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight5) model_decoder_layers_3_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[562] model_decoder_layers_3_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[563] alloc1215: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1212, model_decoder_layers_3_self_attn_v_proj_weight5, model_decoder_layers_3_self_attn_v_proj_bias5, alloc1215) R.vm.kill_object(alloc1212) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias5) alloc1216: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1213, alloc1214, alloc1215, alloc1216) R.vm.kill_object(alloc1213) R.vm.kill_object(alloc1214) R.vm.kill_object(alloc1215) alloc1217: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1215: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), alloc1216, alloc1217) R.vm.kill_object(alloc1216) lv77: R.Tensor((1, 1, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1217, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1217) model_decoder_layers_3_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] model_decoder_layers_3_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[567] alloc1218: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv77, model_decoder_layers_3_self_attn_out_proj_weight5, model_decoder_layers_3_self_attn_out_proj_bias5, alloc1211, alloc1218) R.vm.kill_object(alloc1211) R.vm.kill_object(lv77) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias5) model_decoder_layers_3_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[577] model_decoder_layers_3_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[578] alloc1219: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1218, model_decoder_layers_3_encoder_attn_layer_norm_weight5, model_decoder_layers_3_encoder_attn_layer_norm_bias5, alloc1219) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias5) model_decoder_layers_3_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] model_decoder_layers_3_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[574] alloc1220: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1219, model_decoder_layers_3_encoder_attn_q_proj_weight5, 
model_decoder_layers_3_encoder_attn_q_proj_bias5, alloc1220) R.vm.kill_object(alloc1219) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias5) lv80: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1220, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1220) alloc1221: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1219: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), lv80, alloc1221) R.vm.kill_object(lv80) lv81: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1221, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1221) model_decoder_layers_3_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] model_decoder_layers_3_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[576] alloc1222: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv81, model_decoder_layers_3_encoder_attn_out_proj_weight5, model_decoder_layers_3_encoder_attn_out_proj_bias5, alloc1218, alloc1222) R.vm.kill_object(alloc1218) R.vm.kill_object(lv81) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias5) model_decoder_layers_3_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[583] model_decoder_layers_3_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[584] alloc1223: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) cls.layer_norm3(alloc1222, model_decoder_layers_3_final_layer_norm_weight5, model_decoder_layers_3_final_layer_norm_bias5, alloc1223) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias5) model_decoder_layers_3_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] model_decoder_layers_3_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[580] alloc1224: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1223, model_decoder_layers_3_fc1_weight5, model_decoder_layers_3_fc1_bias5, alloc1224) R.vm.kill_object(alloc1223) R.vm.kill_object(model_decoder_layers_3_fc1_weight5) R.vm.kill_object(model_decoder_layers_3_fc1_bias5) model_decoder_layers_3_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] model_decoder_layers_3_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[582] alloc1225: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1224, model_decoder_layers_3_fc2_weight5, model_decoder_layers_3_fc2_bias5, alloc1222, alloc1225) R.vm.kill_object(alloc1222) R.vm.kill_object(alloc1224) R.vm.kill_object(model_decoder_layers_3_fc2_weight5) R.vm.kill_object(model_decoder_layers_3_fc2_bias5) model_decoder_layers_4_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[592] model_decoder_layers_4_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[593] alloc1226: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1225, model_decoder_layers_4_self_attn_layer_norm_weight5, model_decoder_layers_4_self_attn_layer_norm_bias5, alloc1226) 
R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias5) model_decoder_layers_4_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] model_decoder_layers_4_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[589] alloc1227: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1226, model_decoder_layers_4_self_attn_q_proj_weight5, model_decoder_layers_4_self_attn_q_proj_bias5, alloc1227) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias5) model_decoder_layers_4_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] alloc1228: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1226, model_decoder_layers_4_self_attn_k_proj_weight5, alloc1228) R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight5) model_decoder_layers_4_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] model_decoder_layers_4_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[587] alloc1229: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1226, model_decoder_layers_4_self_attn_v_proj_weight5, model_decoder_layers_4_self_attn_v_proj_bias5, alloc1229) R.vm.kill_object(alloc1226) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias5) alloc1230: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1227, alloc1228, alloc1229, alloc1230) R.vm.kill_object(alloc1227) R.vm.kill_object(alloc1228) R.vm.kill_object(alloc1229) alloc1231: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1229: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), alloc1230, alloc1231) R.vm.kill_object(alloc1230) lv88: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1231, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1231) model_decoder_layers_4_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] model_decoder_layers_4_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[591] alloc1232: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv88, model_decoder_layers_4_self_attn_out_proj_weight5, model_decoder_layers_4_self_attn_out_proj_bias5, alloc1225, alloc1232) R.vm.kill_object(alloc1225) R.vm.kill_object(lv88) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias5) model_decoder_layers_4_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[601] model_decoder_layers_4_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[602] alloc1233: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1232, model_decoder_layers_4_encoder_attn_layer_norm_weight5, model_decoder_layers_4_encoder_attn_layer_norm_bias5, alloc1233) 
R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias5) model_decoder_layers_4_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] model_decoder_layers_4_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[598] alloc1234: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1233, model_decoder_layers_4_encoder_attn_q_proj_weight5, model_decoder_layers_4_encoder_attn_q_proj_bias5, alloc1234) R.vm.kill_object(alloc1233) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias5) lv91: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1234, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1234) alloc1235: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1233: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), lv91, alloc1235) R.vm.kill_object(lv91) lv92: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1235, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1235) model_decoder_layers_4_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] model_decoder_layers_4_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[600] alloc1236: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv92, model_decoder_layers_4_encoder_attn_out_proj_weight5, 
model_decoder_layers_4_encoder_attn_out_proj_bias5, alloc1232, alloc1236) R.vm.kill_object(alloc1232) R.vm.kill_object(lv92) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias5) model_decoder_layers_4_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[607] model_decoder_layers_4_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[608] alloc1237: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1236, model_decoder_layers_4_final_layer_norm_weight5, model_decoder_layers_4_final_layer_norm_bias5, alloc1237) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias5) model_decoder_layers_4_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] model_decoder_layers_4_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[604] alloc1238: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1237, model_decoder_layers_4_fc1_weight5, model_decoder_layers_4_fc1_bias5, alloc1238) R.vm.kill_object(alloc1237) R.vm.kill_object(model_decoder_layers_4_fc1_weight5) R.vm.kill_object(model_decoder_layers_4_fc1_bias5) model_decoder_layers_4_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] model_decoder_layers_4_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[606] alloc1239: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1238, model_decoder_layers_4_fc2_weight5, model_decoder_layers_4_fc2_bias5, alloc1236, alloc1239) R.vm.kill_object(alloc1236) R.vm.kill_object(alloc1238) 
R.vm.kill_object(model_decoder_layers_4_fc2_weight5) R.vm.kill_object(model_decoder_layers_4_fc2_bias5) model_decoder_layers_5_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[616] model_decoder_layers_5_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[617] alloc1240: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1239, model_decoder_layers_5_self_attn_layer_norm_weight5, model_decoder_layers_5_self_attn_layer_norm_bias5, alloc1240) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias5) model_decoder_layers_5_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] model_decoder_layers_5_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[613] alloc1241: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1240, model_decoder_layers_5_self_attn_q_proj_weight5, model_decoder_layers_5_self_attn_q_proj_bias5, alloc1241) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias5) model_decoder_layers_5_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] alloc1242: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1240, model_decoder_layers_5_self_attn_k_proj_weight5, alloc1242) R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight5) model_decoder_layers_5_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[610] model_decoder_layers_5_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[611] alloc1243: 
R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1240, model_decoder_layers_5_self_attn_v_proj_weight5, model_decoder_layers_5_self_attn_v_proj_bias5, alloc1243) R.vm.kill_object(alloc1240) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias5) alloc1244: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1241, alloc1242, alloc1243, alloc1244) R.vm.kill_object(alloc1241) R.vm.kill_object(alloc1242) R.vm.kill_object(alloc1243) alloc1245: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1243: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), alloc1244, alloc1245) R.vm.kill_object(alloc1244) lv99: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1245, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1245) model_decoder_layers_5_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] model_decoder_layers_5_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[615] alloc1246: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv99, model_decoder_layers_5_self_attn_out_proj_weight5, model_decoder_layers_5_self_attn_out_proj_bias5, alloc1239, alloc1246) R.vm.kill_object(alloc1239) R.vm.kill_object(lv99) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight5) 
R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias5) model_decoder_layers_5_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[625] model_decoder_layers_5_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[626] alloc1247: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1246, model_decoder_layers_5_encoder_attn_layer_norm_weight5, model_decoder_layers_5_encoder_attn_layer_norm_bias5, alloc1247) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias5) model_decoder_layers_5_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] model_decoder_layers_5_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[622] alloc1248: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1247, model_decoder_layers_5_encoder_attn_q_proj_weight5, model_decoder_layers_5_encoder_attn_q_proj_bias5, alloc1248) R.vm.kill_object(alloc1247) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias5) lv102: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1248, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1248) alloc1249: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1247: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), lv102, alloc1249) R.vm.kill_object(lv102) lv103: R.Tensor((1, 1, 1280), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1249, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1249) model_decoder_layers_5_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] model_decoder_layers_5_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[624] alloc1250: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv103, model_decoder_layers_5_encoder_attn_out_proj_weight5, model_decoder_layers_5_encoder_attn_out_proj_bias5, alloc1246, alloc1250) R.vm.kill_object(alloc1246) R.vm.kill_object(lv103) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias5) model_decoder_layers_5_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[631] model_decoder_layers_5_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[632] alloc1251: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1250, model_decoder_layers_5_final_layer_norm_weight5, model_decoder_layers_5_final_layer_norm_bias5, alloc1251) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias5) model_decoder_layers_5_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] model_decoder_layers_5_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[628] alloc1252: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1251, model_decoder_layers_5_fc1_weight5, model_decoder_layers_5_fc1_bias5, alloc1252) R.vm.kill_object(alloc1251) 
R.vm.kill_object(model_decoder_layers_5_fc1_weight5) R.vm.kill_object(model_decoder_layers_5_fc1_bias5) model_decoder_layers_5_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] model_decoder_layers_5_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[630] alloc1253: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1252, model_decoder_layers_5_fc2_weight5, model_decoder_layers_5_fc2_bias5, alloc1250, alloc1253) R.vm.kill_object(alloc1250) R.vm.kill_object(alloc1252) R.vm.kill_object(model_decoder_layers_5_fc2_weight5) R.vm.kill_object(model_decoder_layers_5_fc2_bias5) model_decoder_layers_6_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[640] model_decoder_layers_6_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[641] alloc1254: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1253, model_decoder_layers_6_self_attn_layer_norm_weight5, model_decoder_layers_6_self_attn_layer_norm_bias5, alloc1254) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias5) model_decoder_layers_6_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] model_decoder_layers_6_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[637] alloc1255: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1254, model_decoder_layers_6_self_attn_q_proj_weight5, model_decoder_layers_6_self_attn_q_proj_bias5, alloc1255) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias5) 
model_decoder_layers_6_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] alloc1256: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1254, model_decoder_layers_6_self_attn_k_proj_weight5, alloc1256) R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight5) model_decoder_layers_6_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] model_decoder_layers_6_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[635] alloc1257: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1254, model_decoder_layers_6_self_attn_v_proj_weight5, model_decoder_layers_6_self_attn_v_proj_bias5, alloc1257) R.vm.kill_object(alloc1254) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias5) alloc1258: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1255, alloc1256, alloc1257, alloc1258) R.vm.kill_object(alloc1255) R.vm.kill_object(alloc1256) R.vm.kill_object(alloc1257) alloc1259: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1257: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), alloc1258, alloc1259) R.vm.kill_object(alloc1258) lv110: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1259, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1259) model_decoder_layers_6_self_attn_out_proj_weight5: R.Tensor((1280, 
1280), dtype="float16") = packed_params[638] model_decoder_layers_6_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[639] alloc1260: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv110, model_decoder_layers_6_self_attn_out_proj_weight5, model_decoder_layers_6_self_attn_out_proj_bias5, alloc1253, alloc1260) R.vm.kill_object(alloc1253) R.vm.kill_object(lv110) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias5) model_decoder_layers_6_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[649] model_decoder_layers_6_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[650] alloc1261: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1260, model_decoder_layers_6_encoder_attn_layer_norm_weight5, model_decoder_layers_6_encoder_attn_layer_norm_bias5, alloc1261) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias5) model_decoder_layers_6_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] model_decoder_layers_6_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[646] alloc1262: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1261, model_decoder_layers_6_encoder_attn_q_proj_weight5, model_decoder_layers_6_encoder_attn_q_proj_bias5, alloc1262) R.vm.kill_object(alloc1261) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias5) lv113: R.Tensor((1, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1262, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1262) alloc1263: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1261: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), lv113, alloc1263) R.vm.kill_object(lv113) lv114: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1263, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1263) model_decoder_layers_6_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] model_decoder_layers_6_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[648] alloc1264: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv114, model_decoder_layers_6_encoder_attn_out_proj_weight5, model_decoder_layers_6_encoder_attn_out_proj_bias5, alloc1260, alloc1264) R.vm.kill_object(alloc1260) R.vm.kill_object(lv114) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias5) model_decoder_layers_6_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[655] model_decoder_layers_6_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[656] alloc1265: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1264, model_decoder_layers_6_final_layer_norm_weight5, model_decoder_layers_6_final_layer_norm_bias5, alloc1265) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight5) 
R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias5) model_decoder_layers_6_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] model_decoder_layers_6_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[652] alloc1266: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1265, model_decoder_layers_6_fc1_weight5, model_decoder_layers_6_fc1_bias5, alloc1266) R.vm.kill_object(alloc1265) R.vm.kill_object(model_decoder_layers_6_fc1_weight5) R.vm.kill_object(model_decoder_layers_6_fc1_bias5) model_decoder_layers_6_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[653] model_decoder_layers_6_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[654] alloc1267: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1266, model_decoder_layers_6_fc2_weight5, model_decoder_layers_6_fc2_bias5, alloc1264, alloc1267) R.vm.kill_object(alloc1264) R.vm.kill_object(alloc1266) R.vm.kill_object(model_decoder_layers_6_fc2_weight5) R.vm.kill_object(model_decoder_layers_6_fc2_bias5) model_decoder_layers_7_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[664] model_decoder_layers_7_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[665] alloc1268: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1267, model_decoder_layers_7_self_attn_layer_norm_weight5, model_decoder_layers_7_self_attn_layer_norm_bias5, alloc1268) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias5) model_decoder_layers_7_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") 
= packed_params[660] model_decoder_layers_7_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[661] alloc1269: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1268, model_decoder_layers_7_self_attn_q_proj_weight5, model_decoder_layers_7_self_attn_q_proj_bias5, alloc1269) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias5) model_decoder_layers_7_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] alloc1270: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1268, model_decoder_layers_7_self_attn_k_proj_weight5, alloc1270) R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight5) model_decoder_layers_7_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] model_decoder_layers_7_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[659] alloc1271: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1268, model_decoder_layers_7_self_attn_v_proj_weight5, model_decoder_layers_7_self_attn_v_proj_bias5, alloc1271) R.vm.kill_object(alloc1268) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias5) alloc1272: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1269, alloc1270, alloc1271, alloc1272) R.vm.kill_object(alloc1269) R.vm.kill_object(alloc1270) R.vm.kill_object(alloc1271) alloc1273: R.Tensor((1, 20, 64), dtype="float16") = 
R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1271: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), alloc1272, alloc1273) R.vm.kill_object(alloc1272) lv121: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1273, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1273) model_decoder_layers_7_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] model_decoder_layers_7_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[663] alloc1274: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv121, model_decoder_layers_7_self_attn_out_proj_weight5, model_decoder_layers_7_self_attn_out_proj_bias5, alloc1267, alloc1274) R.vm.kill_object(alloc1267) R.vm.kill_object(lv121) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias5) model_decoder_layers_7_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[673] model_decoder_layers_7_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[674] alloc1275: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1274, model_decoder_layers_7_encoder_attn_layer_norm_weight5, model_decoder_layers_7_encoder_attn_layer_norm_bias5, alloc1275) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias5) model_decoder_layers_7_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] 
model_decoder_layers_7_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[670] alloc1276: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1275, model_decoder_layers_7_encoder_attn_q_proj_weight5, model_decoder_layers_7_encoder_attn_q_proj_bias5, alloc1276) R.vm.kill_object(alloc1275) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias5) lv124: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1276, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1276) alloc1277: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1275: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), lv124, alloc1277) R.vm.kill_object(lv124) lv125: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1277, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1277) model_decoder_layers_7_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] model_decoder_layers_7_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[672] alloc1278: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv125, model_decoder_layers_7_encoder_attn_out_proj_weight5, model_decoder_layers_7_encoder_attn_out_proj_bias5, alloc1274, alloc1278) R.vm.kill_object(alloc1274) R.vm.kill_object(lv125) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight5) 
R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias5) model_decoder_layers_7_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[679] model_decoder_layers_7_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[680] alloc1279: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1278, model_decoder_layers_7_final_layer_norm_weight5, model_decoder_layers_7_final_layer_norm_bias5, alloc1279) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias5) model_decoder_layers_7_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] model_decoder_layers_7_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[676] alloc1280: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1279, model_decoder_layers_7_fc1_weight5, model_decoder_layers_7_fc1_bias5, alloc1280) R.vm.kill_object(alloc1279) R.vm.kill_object(model_decoder_layers_7_fc1_weight5) R.vm.kill_object(model_decoder_layers_7_fc1_bias5) model_decoder_layers_7_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] model_decoder_layers_7_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[678] alloc1281: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1280, model_decoder_layers_7_fc2_weight5, model_decoder_layers_7_fc2_bias5, alloc1278, alloc1281) R.vm.kill_object(alloc1278) R.vm.kill_object(alloc1280) R.vm.kill_object(model_decoder_layers_7_fc2_weight5) R.vm.kill_object(model_decoder_layers_7_fc2_bias5) model_decoder_layers_8_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[688] 
model_decoder_layers_8_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[689] alloc1282: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1281, model_decoder_layers_8_self_attn_layer_norm_weight5, model_decoder_layers_8_self_attn_layer_norm_bias5, alloc1282) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias5) model_decoder_layers_8_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] model_decoder_layers_8_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[685] alloc1283: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1282, model_decoder_layers_8_self_attn_q_proj_weight5, model_decoder_layers_8_self_attn_q_proj_bias5, alloc1283) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias5) model_decoder_layers_8_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] alloc1284: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1282, model_decoder_layers_8_self_attn_k_proj_weight5, alloc1284) R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight5) model_decoder_layers_8_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[682] model_decoder_layers_8_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[683] alloc1285: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1282, model_decoder_layers_8_self_attn_v_proj_weight5, 
model_decoder_layers_8_self_attn_v_proj_bias5, alloc1285) R.vm.kill_object(alloc1282) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias5) alloc1286: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1283, alloc1284, alloc1285, alloc1286) R.vm.kill_object(alloc1283) R.vm.kill_object(alloc1284) R.vm.kill_object(alloc1285) alloc1287: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1285: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), alloc1286, alloc1287) R.vm.kill_object(alloc1286) lv132: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1287, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1287) model_decoder_layers_8_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] model_decoder_layers_8_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[687] alloc1288: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv132, model_decoder_layers_8_self_attn_out_proj_weight5, model_decoder_layers_8_self_attn_out_proj_bias5, alloc1281, alloc1288) R.vm.kill_object(alloc1281) R.vm.kill_object(lv132) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias5) model_decoder_layers_8_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[697] model_decoder_layers_8_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = 
packed_params[698] alloc1289: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1288, model_decoder_layers_8_encoder_attn_layer_norm_weight5, model_decoder_layers_8_encoder_attn_layer_norm_bias5, alloc1289) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias5) model_decoder_layers_8_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] model_decoder_layers_8_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[694] alloc1290: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1289, model_decoder_layers_8_encoder_attn_q_proj_weight5, model_decoder_layers_8_encoder_attn_q_proj_bias5, alloc1290) R.vm.kill_object(alloc1289) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias5) lv135: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1290, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1290) alloc1291: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1289: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), lv135, alloc1291) R.vm.kill_object(lv135) lv136: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1291, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1291) model_decoder_layers_8_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] 
model_decoder_layers_8_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[696] alloc1292: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv136, model_decoder_layers_8_encoder_attn_out_proj_weight5, model_decoder_layers_8_encoder_attn_out_proj_bias5, alloc1288, alloc1292) R.vm.kill_object(alloc1288) R.vm.kill_object(lv136) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias5) model_decoder_layers_8_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[703] model_decoder_layers_8_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[704] alloc1293: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1292, model_decoder_layers_8_final_layer_norm_weight5, model_decoder_layers_8_final_layer_norm_bias5, alloc1293) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias5) model_decoder_layers_8_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] model_decoder_layers_8_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[700] alloc1294: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1293, model_decoder_layers_8_fc1_weight5, model_decoder_layers_8_fc1_bias5, alloc1294) R.vm.kill_object(alloc1293) R.vm.kill_object(model_decoder_layers_8_fc1_weight5) R.vm.kill_object(model_decoder_layers_8_fc1_bias5) model_decoder_layers_8_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] model_decoder_layers_8_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[702] 
alloc1295: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1294, model_decoder_layers_8_fc2_weight5, model_decoder_layers_8_fc2_bias5, alloc1292, alloc1295) R.vm.kill_object(alloc1292) R.vm.kill_object(alloc1294) R.vm.kill_object(model_decoder_layers_8_fc2_weight5) R.vm.kill_object(model_decoder_layers_8_fc2_bias5) model_decoder_layers_9_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[712] model_decoder_layers_9_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[713] alloc1296: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1295, model_decoder_layers_9_self_attn_layer_norm_weight5, model_decoder_layers_9_self_attn_layer_norm_bias5, alloc1296) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias5) model_decoder_layers_9_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[708] model_decoder_layers_9_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[709] alloc1297: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1296, model_decoder_layers_9_self_attn_q_proj_weight5, model_decoder_layers_9_self_attn_q_proj_bias5, alloc1297) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias5) model_decoder_layers_9_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] alloc1298: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1296, 
model_decoder_layers_9_self_attn_k_proj_weight5, alloc1298) R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight5) model_decoder_layers_9_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] model_decoder_layers_9_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[707] alloc1299: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1296, model_decoder_layers_9_self_attn_v_proj_weight5, model_decoder_layers_9_self_attn_v_proj_bias5, alloc1299) R.vm.kill_object(alloc1296) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias5) alloc1300: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1297, alloc1298, alloc1299, alloc1300) R.vm.kill_object(alloc1297) R.vm.kill_object(alloc1298) R.vm.kill_object(alloc1299) alloc1301: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1299: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), alloc1300, alloc1301) R.vm.kill_object(alloc1300) lv143: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1301, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1301) model_decoder_layers_9_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] model_decoder_layers_9_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[711] alloc1302: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv143, model_decoder_layers_9_self_attn_out_proj_weight5, model_decoder_layers_9_self_attn_out_proj_bias5, alloc1295, alloc1302) R.vm.kill_object(alloc1295) R.vm.kill_object(lv143) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias5) model_decoder_layers_9_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[721] model_decoder_layers_9_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[722] alloc1303: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1302, model_decoder_layers_9_encoder_attn_layer_norm_weight5, model_decoder_layers_9_encoder_attn_layer_norm_bias5, alloc1303) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias5) model_decoder_layers_9_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[717] model_decoder_layers_9_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[718] alloc1304: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1303, model_decoder_layers_9_encoder_attn_q_proj_weight5, model_decoder_layers_9_encoder_attn_q_proj_bias5, alloc1304) R.vm.kill_object(alloc1303) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias5) lv146: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1304, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1304) alloc1305: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), 
R.shape([1, 20, 64]), R.dtype("float16")) _1303: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), lv146, alloc1305) R.vm.kill_object(lv146) lv147: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1305, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1305) model_decoder_layers_9_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] model_decoder_layers_9_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[720] alloc1306: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv147, model_decoder_layers_9_encoder_attn_out_proj_weight5, model_decoder_layers_9_encoder_attn_out_proj_bias5, alloc1302, alloc1306) R.vm.kill_object(alloc1302) R.vm.kill_object(lv147) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias5) model_decoder_layers_9_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[727] model_decoder_layers_9_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[728] alloc1307: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1306, model_decoder_layers_9_final_layer_norm_weight5, model_decoder_layers_9_final_layer_norm_bias5, alloc1307) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias5) model_decoder_layers_9_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] model_decoder_layers_9_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[724] alloc1308: R.Tensor((1, 1, 5120), 
dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1307, model_decoder_layers_9_fc1_weight5, model_decoder_layers_9_fc1_bias5, alloc1308) R.vm.kill_object(alloc1307) R.vm.kill_object(model_decoder_layers_9_fc1_weight5) R.vm.kill_object(model_decoder_layers_9_fc1_bias5) model_decoder_layers_9_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] model_decoder_layers_9_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[726] alloc1309: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1308, model_decoder_layers_9_fc2_weight5, model_decoder_layers_9_fc2_bias5, alloc1306, alloc1309) R.vm.kill_object(alloc1306) R.vm.kill_object(alloc1308) R.vm.kill_object(model_decoder_layers_9_fc2_weight5) R.vm.kill_object(model_decoder_layers_9_fc2_bias5) model_decoder_layers_10_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[736] model_decoder_layers_10_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[737] alloc1310: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1309, model_decoder_layers_10_self_attn_layer_norm_weight5, model_decoder_layers_10_self_attn_layer_norm_bias5, alloc1310) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias5) model_decoder_layers_10_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] model_decoder_layers_10_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[733] alloc1311: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.fused_NT_matmul_add7(alloc1310, model_decoder_layers_10_self_attn_q_proj_weight5, model_decoder_layers_10_self_attn_q_proj_bias5, alloc1311) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias5) model_decoder_layers_10_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[729] alloc1312: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1310, model_decoder_layers_10_self_attn_k_proj_weight5, alloc1312) R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight5) model_decoder_layers_10_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] model_decoder_layers_10_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[731] alloc1313: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1310, model_decoder_layers_10_self_attn_v_proj_weight5, model_decoder_layers_10_self_attn_v_proj_bias5, alloc1313) R.vm.kill_object(alloc1310) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias5) alloc1314: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1311, alloc1312, alloc1313, alloc1314) R.vm.kill_object(alloc1311) R.vm.kill_object(alloc1312) R.vm.kill_object(alloc1313) alloc1315: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1313: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), alloc1314, alloc1315) 
R.vm.kill_object(alloc1314) lv154: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1315, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1315) model_decoder_layers_10_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] model_decoder_layers_10_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[735] alloc1316: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv154, model_decoder_layers_10_self_attn_out_proj_weight5, model_decoder_layers_10_self_attn_out_proj_bias5, alloc1309, alloc1316) R.vm.kill_object(alloc1309) R.vm.kill_object(lv154) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias5) model_decoder_layers_10_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[745] model_decoder_layers_10_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[746] alloc1317: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1316, model_decoder_layers_10_encoder_attn_layer_norm_weight5, model_decoder_layers_10_encoder_attn_layer_norm_bias5, alloc1317) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias5) model_decoder_layers_10_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] model_decoder_layers_10_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[742] alloc1318: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.fused_NT_matmul_add7(alloc1317, model_decoder_layers_10_encoder_attn_q_proj_weight5, model_decoder_layers_10_encoder_attn_q_proj_bias5, alloc1318) R.vm.kill_object(alloc1317) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias5) lv157: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1318, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1318) alloc1319: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1317: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), lv157, alloc1319) R.vm.kill_object(lv157) lv158: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1319, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1319) model_decoder_layers_10_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] model_decoder_layers_10_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[744] alloc1320: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv158, model_decoder_layers_10_encoder_attn_out_proj_weight5, model_decoder_layers_10_encoder_attn_out_proj_bias5, alloc1316, alloc1320) R.vm.kill_object(alloc1316) R.vm.kill_object(lv158) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias5) model_decoder_layers_10_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[751] model_decoder_layers_10_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[752] alloc1321: 
R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1320, model_decoder_layers_10_final_layer_norm_weight5, model_decoder_layers_10_final_layer_norm_bias5, alloc1321) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias5) model_decoder_layers_10_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] model_decoder_layers_10_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[748] alloc1322: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1321, model_decoder_layers_10_fc1_weight5, model_decoder_layers_10_fc1_bias5, alloc1322) R.vm.kill_object(alloc1321) R.vm.kill_object(model_decoder_layers_10_fc1_weight5) R.vm.kill_object(model_decoder_layers_10_fc1_bias5) model_decoder_layers_10_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] model_decoder_layers_10_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[750] alloc1323: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1322, model_decoder_layers_10_fc2_weight5, model_decoder_layers_10_fc2_bias5, alloc1320, alloc1323) R.vm.kill_object(alloc1320) R.vm.kill_object(alloc1322) R.vm.kill_object(model_decoder_layers_10_fc2_weight5) R.vm.kill_object(model_decoder_layers_10_fc2_bias5) model_decoder_layers_11_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[760] model_decoder_layers_11_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[761] alloc1324: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.layer_norm3(alloc1323, model_decoder_layers_11_self_attn_layer_norm_weight5, model_decoder_layers_11_self_attn_layer_norm_bias5, alloc1324) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias5) model_decoder_layers_11_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] model_decoder_layers_11_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[757] alloc1325: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1324, model_decoder_layers_11_self_attn_q_proj_weight5, model_decoder_layers_11_self_attn_q_proj_bias5, alloc1325) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias5) model_decoder_layers_11_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] alloc1326: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1324, model_decoder_layers_11_self_attn_k_proj_weight5, alloc1326) R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight5) model_decoder_layers_11_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] model_decoder_layers_11_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[755] alloc1327: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1324, model_decoder_layers_11_self_attn_v_proj_weight5, model_decoder_layers_11_self_attn_v_proj_bias5, alloc1327) R.vm.kill_object(alloc1324) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias5) alloc1328: 
R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1325, alloc1326, alloc1327, alloc1328) R.vm.kill_object(alloc1325) R.vm.kill_object(alloc1326) R.vm.kill_object(alloc1327) alloc1329: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1327: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), alloc1328, alloc1329) R.vm.kill_object(alloc1328) lv165: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1329, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1329) model_decoder_layers_11_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] model_decoder_layers_11_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[759] alloc1330: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv165, model_decoder_layers_11_self_attn_out_proj_weight5, model_decoder_layers_11_self_attn_out_proj_bias5, alloc1323, alloc1330) R.vm.kill_object(alloc1323) R.vm.kill_object(lv165) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias5) model_decoder_layers_11_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[769] model_decoder_layers_11_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[770] alloc1331: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1330, 
model_decoder_layers_11_encoder_attn_layer_norm_weight5, model_decoder_layers_11_encoder_attn_layer_norm_bias5, alloc1331) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias5) model_decoder_layers_11_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] model_decoder_layers_11_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[766] alloc1332: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1331, model_decoder_layers_11_encoder_attn_q_proj_weight5, model_decoder_layers_11_encoder_attn_q_proj_bias5, alloc1332) R.vm.kill_object(alloc1331) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias5) lv168: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1332, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1332) alloc1333: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1331: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), lv168, alloc1333) R.vm.kill_object(lv168) lv169: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1333, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1333) model_decoder_layers_11_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] model_decoder_layers_11_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[768] alloc1334: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), 
R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv169, model_decoder_layers_11_encoder_attn_out_proj_weight5, model_decoder_layers_11_encoder_attn_out_proj_bias5, alloc1330, alloc1334) R.vm.kill_object(alloc1330) R.vm.kill_object(lv169) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias5) model_decoder_layers_11_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[775] model_decoder_layers_11_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[776] alloc1335: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1334, model_decoder_layers_11_final_layer_norm_weight5, model_decoder_layers_11_final_layer_norm_bias5, alloc1335) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias5) model_decoder_layers_11_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] model_decoder_layers_11_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[772] alloc1336: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1335, model_decoder_layers_11_fc1_weight5, model_decoder_layers_11_fc1_bias5, alloc1336) R.vm.kill_object(alloc1335) R.vm.kill_object(model_decoder_layers_11_fc1_weight5) R.vm.kill_object(model_decoder_layers_11_fc1_bias5) model_decoder_layers_11_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] model_decoder_layers_11_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[774] alloc1337: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1336, 
model_decoder_layers_11_fc2_weight5, model_decoder_layers_11_fc2_bias5, alloc1334, alloc1337) R.vm.kill_object(alloc1334) R.vm.kill_object(alloc1336) R.vm.kill_object(model_decoder_layers_11_fc2_weight5) R.vm.kill_object(model_decoder_layers_11_fc2_bias5) model_decoder_layers_12_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[784] model_decoder_layers_12_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[785] alloc1338: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1337, model_decoder_layers_12_self_attn_layer_norm_weight5, model_decoder_layers_12_self_attn_layer_norm_bias5, alloc1338) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias5) model_decoder_layers_12_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] model_decoder_layers_12_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[781] alloc1339: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1338, model_decoder_layers_12_self_attn_q_proj_weight5, model_decoder_layers_12_self_attn_q_proj_bias5, alloc1339) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias5) model_decoder_layers_12_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] alloc1340: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1338, model_decoder_layers_12_self_attn_k_proj_weight5, alloc1340) R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight5) model_decoder_layers_12_self_attn_v_proj_weight5: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[778] model_decoder_layers_12_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[779] alloc1341: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1338, model_decoder_layers_12_self_attn_v_proj_weight5, model_decoder_layers_12_self_attn_v_proj_bias5, alloc1341) R.vm.kill_object(alloc1338) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias5) alloc1342: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1339, alloc1340, alloc1341, alloc1342) R.vm.kill_object(alloc1339) R.vm.kill_object(alloc1340) R.vm.kill_object(alloc1341) alloc1343: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1341: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), alloc1342, alloc1343) R.vm.kill_object(alloc1342) lv176: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1343, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1343) model_decoder_layers_12_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] model_decoder_layers_12_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[783] alloc1344: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv176, model_decoder_layers_12_self_attn_out_proj_weight5, model_decoder_layers_12_self_attn_out_proj_bias5, alloc1337, 
alloc1344) R.vm.kill_object(alloc1337) R.vm.kill_object(lv176) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias5) model_decoder_layers_12_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[793] model_decoder_layers_12_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[794] alloc1345: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1344, model_decoder_layers_12_encoder_attn_layer_norm_weight5, model_decoder_layers_12_encoder_attn_layer_norm_bias5, alloc1345) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias5) model_decoder_layers_12_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] model_decoder_layers_12_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[790] alloc1346: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1345, model_decoder_layers_12_encoder_attn_q_proj_weight5, model_decoder_layers_12_encoder_attn_q_proj_bias5, alloc1346) R.vm.kill_object(alloc1345) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias5) lv179: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1346, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1346) alloc1347: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1345: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), 
R.prim_value(T.float32(1)), lv179, alloc1347) R.vm.kill_object(lv179) lv180: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1347, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1347) model_decoder_layers_12_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] model_decoder_layers_12_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[792] alloc1348: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv180, model_decoder_layers_12_encoder_attn_out_proj_weight5, model_decoder_layers_12_encoder_attn_out_proj_bias5, alloc1344, alloc1348) R.vm.kill_object(alloc1344) R.vm.kill_object(lv180) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias5) model_decoder_layers_12_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[799] model_decoder_layers_12_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[800] alloc1349: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1348, model_decoder_layers_12_final_layer_norm_weight5, model_decoder_layers_12_final_layer_norm_bias5, alloc1349) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias5) model_decoder_layers_12_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] model_decoder_layers_12_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[796] alloc1350: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1349, 
model_decoder_layers_12_fc1_weight5, model_decoder_layers_12_fc1_bias5, alloc1350) R.vm.kill_object(alloc1349) R.vm.kill_object(model_decoder_layers_12_fc1_weight5) R.vm.kill_object(model_decoder_layers_12_fc1_bias5) model_decoder_layers_12_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] model_decoder_layers_12_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[798] alloc1351: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1350, model_decoder_layers_12_fc2_weight5, model_decoder_layers_12_fc2_bias5, alloc1348, alloc1351) R.vm.kill_object(alloc1348) R.vm.kill_object(alloc1350) R.vm.kill_object(model_decoder_layers_12_fc2_weight5) R.vm.kill_object(model_decoder_layers_12_fc2_bias5) model_decoder_layers_13_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[808] model_decoder_layers_13_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[809] alloc1352: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1351, model_decoder_layers_13_self_attn_layer_norm_weight5, model_decoder_layers_13_self_attn_layer_norm_bias5, alloc1352) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias5) model_decoder_layers_13_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] model_decoder_layers_13_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[805] alloc1353: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1352, model_decoder_layers_13_self_attn_q_proj_weight5, model_decoder_layers_13_self_attn_q_proj_bias5, alloc1353) 
R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias5) model_decoder_layers_13_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] alloc1354: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1352, model_decoder_layers_13_self_attn_k_proj_weight5, alloc1354) R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight5) model_decoder_layers_13_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] model_decoder_layers_13_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[803] alloc1355: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1352, model_decoder_layers_13_self_attn_v_proj_weight5, model_decoder_layers_13_self_attn_v_proj_bias5, alloc1355) R.vm.kill_object(alloc1352) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias5) alloc1356: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1353, alloc1354, alloc1355, alloc1356) R.vm.kill_object(alloc1353) R.vm.kill_object(alloc1354) R.vm.kill_object(alloc1355) alloc1357: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1355: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), alloc1356, alloc1357) R.vm.kill_object(alloc1356) lv187: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1357, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1357) model_decoder_layers_13_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] model_decoder_layers_13_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[807] alloc1358: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv187, model_decoder_layers_13_self_attn_out_proj_weight5, model_decoder_layers_13_self_attn_out_proj_bias5, alloc1351, alloc1358) R.vm.kill_object(alloc1351) R.vm.kill_object(lv187) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias5) model_decoder_layers_13_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[817] model_decoder_layers_13_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[818] alloc1359: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1358, model_decoder_layers_13_encoder_attn_layer_norm_weight5, model_decoder_layers_13_encoder_attn_layer_norm_bias5, alloc1359) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias5) model_decoder_layers_13_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] model_decoder_layers_13_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[814] alloc1360: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1359, model_decoder_layers_13_encoder_attn_q_proj_weight5, model_decoder_layers_13_encoder_attn_q_proj_bias5, alloc1360) R.vm.kill_object(alloc1359) 
R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias5) lv190: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1360, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1360) alloc1361: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1359: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), lv190, alloc1361) R.vm.kill_object(lv190) lv191: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1361, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1361) model_decoder_layers_13_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] model_decoder_layers_13_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[816] alloc1362: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv191, model_decoder_layers_13_encoder_attn_out_proj_weight5, model_decoder_layers_13_encoder_attn_out_proj_bias5, alloc1358, alloc1362) R.vm.kill_object(alloc1358) R.vm.kill_object(lv191) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias5) model_decoder_layers_13_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[823] model_decoder_layers_13_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[824] alloc1363: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1362, 
model_decoder_layers_13_final_layer_norm_weight5, model_decoder_layers_13_final_layer_norm_bias5, alloc1363) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias5) model_decoder_layers_13_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] model_decoder_layers_13_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[820] alloc1364: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1363, model_decoder_layers_13_fc1_weight5, model_decoder_layers_13_fc1_bias5, alloc1364) R.vm.kill_object(alloc1363) R.vm.kill_object(model_decoder_layers_13_fc1_weight5) R.vm.kill_object(model_decoder_layers_13_fc1_bias5) model_decoder_layers_13_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] model_decoder_layers_13_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[822] alloc1365: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1364, model_decoder_layers_13_fc2_weight5, model_decoder_layers_13_fc2_bias5, alloc1362, alloc1365) R.vm.kill_object(alloc1362) R.vm.kill_object(alloc1364) R.vm.kill_object(model_decoder_layers_13_fc2_weight5) R.vm.kill_object(model_decoder_layers_13_fc2_bias5) model_decoder_layers_14_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[832] model_decoder_layers_14_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[833] alloc1366: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1365, model_decoder_layers_14_self_attn_layer_norm_weight5, model_decoder_layers_14_self_attn_layer_norm_bias5, alloc1366) 
R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias5) model_decoder_layers_14_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] model_decoder_layers_14_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[829] alloc1367: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1366, model_decoder_layers_14_self_attn_q_proj_weight5, model_decoder_layers_14_self_attn_q_proj_bias5, alloc1367) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias5) model_decoder_layers_14_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] alloc1368: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1366, model_decoder_layers_14_self_attn_k_proj_weight5, alloc1368) R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight5) model_decoder_layers_14_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] model_decoder_layers_14_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[827] alloc1369: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1366, model_decoder_layers_14_self_attn_v_proj_weight5, model_decoder_layers_14_self_attn_v_proj_bias5, alloc1369) R.vm.kill_object(alloc1366) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias5) alloc1370: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1367, alloc1368, alloc1369, alloc1370) R.vm.kill_object(alloc1367) R.vm.kill_object(alloc1368) R.vm.kill_object(alloc1369) alloc1371: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1369: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), alloc1370, alloc1371) R.vm.kill_object(alloc1370) lv198: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1371, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1371) model_decoder_layers_14_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] model_decoder_layers_14_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[831] alloc1372: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv198, model_decoder_layers_14_self_attn_out_proj_weight5, model_decoder_layers_14_self_attn_out_proj_bias5, alloc1365, alloc1372) R.vm.kill_object(alloc1365) R.vm.kill_object(lv198) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias5) model_decoder_layers_14_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[841] model_decoder_layers_14_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[842] alloc1373: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1372, model_decoder_layers_14_encoder_attn_layer_norm_weight5, model_decoder_layers_14_encoder_attn_layer_norm_bias5, alloc1373) 
R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias5) model_decoder_layers_14_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] model_decoder_layers_14_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[838] alloc1374: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1373, model_decoder_layers_14_encoder_attn_q_proj_weight5, model_decoder_layers_14_encoder_attn_q_proj_bias5, alloc1374) R.vm.kill_object(alloc1373) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias5) lv201: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1374, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1374) alloc1375: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1373: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), lv201, alloc1375) R.vm.kill_object(lv201) lv202: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1375, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1375) model_decoder_layers_14_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] model_decoder_layers_14_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[840] alloc1376: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv202, 
model_decoder_layers_14_encoder_attn_out_proj_weight5, model_decoder_layers_14_encoder_attn_out_proj_bias5, alloc1372, alloc1376) R.vm.kill_object(alloc1372) R.vm.kill_object(lv202) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias5) model_decoder_layers_14_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[847] model_decoder_layers_14_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[848] alloc1377: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1376, model_decoder_layers_14_final_layer_norm_weight5, model_decoder_layers_14_final_layer_norm_bias5, alloc1377) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias5) model_decoder_layers_14_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[843] model_decoder_layers_14_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[844] alloc1378: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1377, model_decoder_layers_14_fc1_weight5, model_decoder_layers_14_fc1_bias5, alloc1378) R.vm.kill_object(alloc1377) R.vm.kill_object(model_decoder_layers_14_fc1_weight5) R.vm.kill_object(model_decoder_layers_14_fc1_bias5) model_decoder_layers_14_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] model_decoder_layers_14_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[846] alloc1379: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1378, model_decoder_layers_14_fc2_weight5, model_decoder_layers_14_fc2_bias5, alloc1376, 
alloc1379) R.vm.kill_object(alloc1376) R.vm.kill_object(alloc1378) R.vm.kill_object(model_decoder_layers_14_fc2_weight5) R.vm.kill_object(model_decoder_layers_14_fc2_bias5) model_decoder_layers_15_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[856] model_decoder_layers_15_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[857] alloc1380: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1379, model_decoder_layers_15_self_attn_layer_norm_weight5, model_decoder_layers_15_self_attn_layer_norm_bias5, alloc1380) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias5) model_decoder_layers_15_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] model_decoder_layers_15_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[853] alloc1381: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1380, model_decoder_layers_15_self_attn_q_proj_weight5, model_decoder_layers_15_self_attn_q_proj_bias5, alloc1381) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias5) model_decoder_layers_15_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] alloc1382: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1380, model_decoder_layers_15_self_attn_k_proj_weight5, alloc1382) R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight5) model_decoder_layers_15_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] 
model_decoder_layers_15_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[851] alloc1383: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1380, model_decoder_layers_15_self_attn_v_proj_weight5, model_decoder_layers_15_self_attn_v_proj_bias5, alloc1383) R.vm.kill_object(alloc1380) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias5) alloc1384: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1381, alloc1382, alloc1383, alloc1384) R.vm.kill_object(alloc1381) R.vm.kill_object(alloc1382) R.vm.kill_object(alloc1383) alloc1385: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1383: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), alloc1384, alloc1385) R.vm.kill_object(alloc1384) lv209: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1385, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1385) model_decoder_layers_15_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] model_decoder_layers_15_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[855] alloc1386: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv209, model_decoder_layers_15_self_attn_out_proj_weight5, model_decoder_layers_15_self_attn_out_proj_bias5, alloc1379, alloc1386) R.vm.kill_object(alloc1379) R.vm.kill_object(lv209) 
R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias5) model_decoder_layers_15_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[865] model_decoder_layers_15_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[866] alloc1387: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1386, model_decoder_layers_15_encoder_attn_layer_norm_weight5, model_decoder_layers_15_encoder_attn_layer_norm_bias5, alloc1387) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias5) model_decoder_layers_15_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] model_decoder_layers_15_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[862] alloc1388: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1387, model_decoder_layers_15_encoder_attn_q_proj_weight5, model_decoder_layers_15_encoder_attn_q_proj_bias5, alloc1388) R.vm.kill_object(alloc1387) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias5) lv212: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1388, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1388) alloc1389: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1387: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), lv212, alloc1389) 
R.vm.kill_object(lv212) lv213: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1389, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1389) model_decoder_layers_15_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] model_decoder_layers_15_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[864] alloc1390: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv213, model_decoder_layers_15_encoder_attn_out_proj_weight5, model_decoder_layers_15_encoder_attn_out_proj_bias5, alloc1386, alloc1390) R.vm.kill_object(alloc1386) R.vm.kill_object(lv213) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias5) model_decoder_layers_15_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[871] model_decoder_layers_15_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[872] alloc1391: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1390, model_decoder_layers_15_final_layer_norm_weight5, model_decoder_layers_15_final_layer_norm_bias5, alloc1391) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias5) model_decoder_layers_15_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] model_decoder_layers_15_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[868] alloc1392: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1391, model_decoder_layers_15_fc1_weight5, 
model_decoder_layers_15_fc1_bias5, alloc1392) R.vm.kill_object(alloc1391) R.vm.kill_object(model_decoder_layers_15_fc1_weight5) R.vm.kill_object(model_decoder_layers_15_fc1_bias5) model_decoder_layers_15_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] model_decoder_layers_15_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[870] alloc1393: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1392, model_decoder_layers_15_fc2_weight5, model_decoder_layers_15_fc2_bias5, alloc1390, alloc1393) R.vm.kill_object(alloc1390) R.vm.kill_object(alloc1392) R.vm.kill_object(model_decoder_layers_15_fc2_weight5) R.vm.kill_object(model_decoder_layers_15_fc2_bias5) model_decoder_layers_16_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[880] model_decoder_layers_16_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[881] alloc1394: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1393, model_decoder_layers_16_self_attn_layer_norm_weight5, model_decoder_layers_16_self_attn_layer_norm_bias5, alloc1394) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias5) model_decoder_layers_16_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] model_decoder_layers_16_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[877] alloc1395: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1394, model_decoder_layers_16_self_attn_q_proj_weight5, model_decoder_layers_16_self_attn_q_proj_bias5, alloc1395) 
R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias5) model_decoder_layers_16_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] alloc1396: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1394, model_decoder_layers_16_self_attn_k_proj_weight5, alloc1396) R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight5) model_decoder_layers_16_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] model_decoder_layers_16_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[875] alloc1397: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1394, model_decoder_layers_16_self_attn_v_proj_weight5, model_decoder_layers_16_self_attn_v_proj_bias5, alloc1397) R.vm.kill_object(alloc1394) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias5) alloc1398: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1395, alloc1396, alloc1397, alloc1398) R.vm.kill_object(alloc1395) R.vm.kill_object(alloc1396) R.vm.kill_object(alloc1397) alloc1399: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1397: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), alloc1398, alloc1399) R.vm.kill_object(alloc1398) lv220: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1399, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1399) model_decoder_layers_16_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] model_decoder_layers_16_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[879] alloc1400: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv220, model_decoder_layers_16_self_attn_out_proj_weight5, model_decoder_layers_16_self_attn_out_proj_bias5, alloc1393, alloc1400) R.vm.kill_object(alloc1393) R.vm.kill_object(lv220) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias5) model_decoder_layers_16_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[889] model_decoder_layers_16_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[890] alloc1401: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1400, model_decoder_layers_16_encoder_attn_layer_norm_weight5, model_decoder_layers_16_encoder_attn_layer_norm_bias5, alloc1401) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias5) model_decoder_layers_16_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] model_decoder_layers_16_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[886] alloc1402: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1401, model_decoder_layers_16_encoder_attn_q_proj_weight5, model_decoder_layers_16_encoder_attn_q_proj_bias5, alloc1402) R.vm.kill_object(alloc1401) 
R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias5) lv223: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1402, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1402) alloc1403: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1401: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), lv223, alloc1403) R.vm.kill_object(lv223) lv224: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1403, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1403) model_decoder_layers_16_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] model_decoder_layers_16_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[888] alloc1404: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv224, model_decoder_layers_16_encoder_attn_out_proj_weight5, model_decoder_layers_16_encoder_attn_out_proj_bias5, alloc1400, alloc1404) R.vm.kill_object(alloc1400) R.vm.kill_object(lv224) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias5) model_decoder_layers_16_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[895] model_decoder_layers_16_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[896] alloc1405: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1404, 
model_decoder_layers_16_final_layer_norm_weight5, model_decoder_layers_16_final_layer_norm_bias5, alloc1405) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias5) model_decoder_layers_16_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] model_decoder_layers_16_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[892] alloc1406: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1405, model_decoder_layers_16_fc1_weight5, model_decoder_layers_16_fc1_bias5, alloc1406) R.vm.kill_object(alloc1405) R.vm.kill_object(model_decoder_layers_16_fc1_weight5) R.vm.kill_object(model_decoder_layers_16_fc1_bias5) model_decoder_layers_16_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] model_decoder_layers_16_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[894] alloc1407: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1406, model_decoder_layers_16_fc2_weight5, model_decoder_layers_16_fc2_bias5, alloc1404, alloc1407) R.vm.kill_object(alloc1404) R.vm.kill_object(alloc1406) R.vm.kill_object(model_decoder_layers_16_fc2_weight5) R.vm.kill_object(model_decoder_layers_16_fc2_bias5) model_decoder_layers_17_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[904] model_decoder_layers_17_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[905] alloc1408: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1407, model_decoder_layers_17_self_attn_layer_norm_weight5, model_decoder_layers_17_self_attn_layer_norm_bias5, alloc1408) 
R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias5) model_decoder_layers_17_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] model_decoder_layers_17_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[901] alloc1409: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1408, model_decoder_layers_17_self_attn_q_proj_weight5, model_decoder_layers_17_self_attn_q_proj_bias5, alloc1409) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias5) model_decoder_layers_17_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] alloc1410: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1408, model_decoder_layers_17_self_attn_k_proj_weight5, alloc1410) R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight5) model_decoder_layers_17_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[898] model_decoder_layers_17_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[899] alloc1411: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1408, model_decoder_layers_17_self_attn_v_proj_weight5, model_decoder_layers_17_self_attn_v_proj_bias5, alloc1411) R.vm.kill_object(alloc1408) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias5) alloc1412: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1409, alloc1410, alloc1411, alloc1412) R.vm.kill_object(alloc1409) R.vm.kill_object(alloc1410) R.vm.kill_object(alloc1411) alloc1413: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1411: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), alloc1412, alloc1413) R.vm.kill_object(alloc1412) lv231: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1413, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1413) model_decoder_layers_17_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] model_decoder_layers_17_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[903] alloc1414: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv231, model_decoder_layers_17_self_attn_out_proj_weight5, model_decoder_layers_17_self_attn_out_proj_bias5, alloc1407, alloc1414) R.vm.kill_object(alloc1407) R.vm.kill_object(lv231) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias5) model_decoder_layers_17_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[913] model_decoder_layers_17_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[914] alloc1415: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1414, model_decoder_layers_17_encoder_attn_layer_norm_weight5, model_decoder_layers_17_encoder_attn_layer_norm_bias5, alloc1415) 
R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias5) model_decoder_layers_17_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] model_decoder_layers_17_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[910] alloc1416: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1415, model_decoder_layers_17_encoder_attn_q_proj_weight5, model_decoder_layers_17_encoder_attn_q_proj_bias5, alloc1416) R.vm.kill_object(alloc1415) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias5) lv234: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1416, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1416) alloc1417: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1415: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), lv234, alloc1417) R.vm.kill_object(lv234) lv235: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1417, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1417) model_decoder_layers_17_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] model_decoder_layers_17_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[912] alloc1418: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv235, 
model_decoder_layers_17_encoder_attn_out_proj_weight5, model_decoder_layers_17_encoder_attn_out_proj_bias5, alloc1414, alloc1418) R.vm.kill_object(alloc1414) R.vm.kill_object(lv235) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias5) model_decoder_layers_17_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[919] model_decoder_layers_17_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[920] alloc1419: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1418, model_decoder_layers_17_final_layer_norm_weight5, model_decoder_layers_17_final_layer_norm_bias5, alloc1419) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias5) model_decoder_layers_17_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[915] model_decoder_layers_17_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[916] alloc1420: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1419, model_decoder_layers_17_fc1_weight5, model_decoder_layers_17_fc1_bias5, alloc1420) R.vm.kill_object(alloc1419) R.vm.kill_object(model_decoder_layers_17_fc1_weight5) R.vm.kill_object(model_decoder_layers_17_fc1_bias5) model_decoder_layers_17_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] model_decoder_layers_17_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[918] alloc1421: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1420, model_decoder_layers_17_fc2_weight5, model_decoder_layers_17_fc2_bias5, alloc1418, 
alloc1421) R.vm.kill_object(alloc1418) R.vm.kill_object(alloc1420) R.vm.kill_object(model_decoder_layers_17_fc2_weight5) R.vm.kill_object(model_decoder_layers_17_fc2_bias5) model_decoder_layers_18_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[928] model_decoder_layers_18_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[929] alloc1422: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1421, model_decoder_layers_18_self_attn_layer_norm_weight5, model_decoder_layers_18_self_attn_layer_norm_bias5, alloc1422) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias5) model_decoder_layers_18_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] model_decoder_layers_18_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[925] alloc1423: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1422, model_decoder_layers_18_self_attn_q_proj_weight5, model_decoder_layers_18_self_attn_q_proj_bias5, alloc1423) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias5) model_decoder_layers_18_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] alloc1424: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1422, model_decoder_layers_18_self_attn_k_proj_weight5, alloc1424) R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight5) model_decoder_layers_18_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] 
model_decoder_layers_18_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[923] alloc1425: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1422, model_decoder_layers_18_self_attn_v_proj_weight5, model_decoder_layers_18_self_attn_v_proj_bias5, alloc1425) R.vm.kill_object(alloc1422) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias5) alloc1426: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1423, alloc1424, alloc1425, alloc1426) R.vm.kill_object(alloc1423) R.vm.kill_object(alloc1424) R.vm.kill_object(alloc1425) alloc1427: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1425: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), alloc1426, alloc1427) R.vm.kill_object(alloc1426) lv242: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1427, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1427) model_decoder_layers_18_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] model_decoder_layers_18_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[927] alloc1428: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv242, model_decoder_layers_18_self_attn_out_proj_weight5, model_decoder_layers_18_self_attn_out_proj_bias5, alloc1421, alloc1428) R.vm.kill_object(alloc1421) R.vm.kill_object(lv242) 
R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias5) model_decoder_layers_18_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[937] model_decoder_layers_18_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[938] alloc1429: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1428, model_decoder_layers_18_encoder_attn_layer_norm_weight5, model_decoder_layers_18_encoder_attn_layer_norm_bias5, alloc1429) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias5) model_decoder_layers_18_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] model_decoder_layers_18_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[934] alloc1430: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1429, model_decoder_layers_18_encoder_attn_q_proj_weight5, model_decoder_layers_18_encoder_attn_q_proj_bias5, alloc1430) R.vm.kill_object(alloc1429) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias5) lv245: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1430, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1430) alloc1431: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1429: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), lv245, alloc1431) 
R.vm.kill_object(lv245) lv246: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1431, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1431) model_decoder_layers_18_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] model_decoder_layers_18_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[936] alloc1432: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv246, model_decoder_layers_18_encoder_attn_out_proj_weight5, model_decoder_layers_18_encoder_attn_out_proj_bias5, alloc1428, alloc1432) R.vm.kill_object(alloc1428) R.vm.kill_object(lv246) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias5) model_decoder_layers_18_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[943] model_decoder_layers_18_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[944] alloc1433: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1432, model_decoder_layers_18_final_layer_norm_weight5, model_decoder_layers_18_final_layer_norm_bias5, alloc1433) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias5) model_decoder_layers_18_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] model_decoder_layers_18_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[940] alloc1434: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1433, model_decoder_layers_18_fc1_weight5, 
model_decoder_layers_18_fc1_bias5, alloc1434) R.vm.kill_object(alloc1433) R.vm.kill_object(model_decoder_layers_18_fc1_weight5) R.vm.kill_object(model_decoder_layers_18_fc1_bias5) model_decoder_layers_18_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] model_decoder_layers_18_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[942] alloc1435: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1434, model_decoder_layers_18_fc2_weight5, model_decoder_layers_18_fc2_bias5, alloc1432, alloc1435) R.vm.kill_object(alloc1432) R.vm.kill_object(alloc1434) R.vm.kill_object(model_decoder_layers_18_fc2_weight5) R.vm.kill_object(model_decoder_layers_18_fc2_bias5) model_decoder_layers_19_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[952] model_decoder_layers_19_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[953] alloc1436: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1435, model_decoder_layers_19_self_attn_layer_norm_weight5, model_decoder_layers_19_self_attn_layer_norm_bias5, alloc1436) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias5) model_decoder_layers_19_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] model_decoder_layers_19_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[949] alloc1437: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1436, model_decoder_layers_19_self_attn_q_proj_weight5, model_decoder_layers_19_self_attn_q_proj_bias5, alloc1437) 
R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias5) model_decoder_layers_19_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] alloc1438: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1436, model_decoder_layers_19_self_attn_k_proj_weight5, alloc1438) R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight5) model_decoder_layers_19_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] model_decoder_layers_19_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[947] alloc1439: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1436, model_decoder_layers_19_self_attn_v_proj_weight5, model_decoder_layers_19_self_attn_v_proj_bias5, alloc1439) R.vm.kill_object(alloc1436) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias5) alloc1440: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1437, alloc1438, alloc1439, alloc1440) R.vm.kill_object(alloc1437) R.vm.kill_object(alloc1438) R.vm.kill_object(alloc1439) alloc1441: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1439: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), alloc1440, alloc1441) R.vm.kill_object(alloc1440) lv253: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1441, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1441) model_decoder_layers_19_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] model_decoder_layers_19_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[951] alloc1442: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv253, model_decoder_layers_19_self_attn_out_proj_weight5, model_decoder_layers_19_self_attn_out_proj_bias5, alloc1435, alloc1442) R.vm.kill_object(alloc1435) R.vm.kill_object(lv253) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias5) model_decoder_layers_19_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[961] model_decoder_layers_19_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[962] alloc1443: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1442, model_decoder_layers_19_encoder_attn_layer_norm_weight5, model_decoder_layers_19_encoder_attn_layer_norm_bias5, alloc1443) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias5) model_decoder_layers_19_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] model_decoder_layers_19_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[958] alloc1444: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1443, model_decoder_layers_19_encoder_attn_q_proj_weight5, model_decoder_layers_19_encoder_attn_q_proj_bias5, alloc1444) R.vm.kill_object(alloc1443) 
R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias5) lv256: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1444, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1444) alloc1445: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1443: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), lv256, alloc1445) R.vm.kill_object(lv256) lv257: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1445, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1445) model_decoder_layers_19_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] model_decoder_layers_19_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[960] alloc1446: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv257, model_decoder_layers_19_encoder_attn_out_proj_weight5, model_decoder_layers_19_encoder_attn_out_proj_bias5, alloc1442, alloc1446) R.vm.kill_object(alloc1442) R.vm.kill_object(lv257) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias5) model_decoder_layers_19_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[967] model_decoder_layers_19_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[968] alloc1447: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1446, 
model_decoder_layers_19_final_layer_norm_weight5, model_decoder_layers_19_final_layer_norm_bias5, alloc1447) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias5) model_decoder_layers_19_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] model_decoder_layers_19_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[964] alloc1448: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1447, model_decoder_layers_19_fc1_weight5, model_decoder_layers_19_fc1_bias5, alloc1448) R.vm.kill_object(alloc1447) R.vm.kill_object(model_decoder_layers_19_fc1_weight5) R.vm.kill_object(model_decoder_layers_19_fc1_bias5) model_decoder_layers_19_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] model_decoder_layers_19_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[966] alloc1449: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1448, model_decoder_layers_19_fc2_weight5, model_decoder_layers_19_fc2_bias5, alloc1446, alloc1449) R.vm.kill_object(alloc1446) R.vm.kill_object(alloc1448) R.vm.kill_object(model_decoder_layers_19_fc2_weight5) R.vm.kill_object(model_decoder_layers_19_fc2_bias5) model_decoder_layers_20_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[976] model_decoder_layers_20_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[977] alloc1450: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1449, model_decoder_layers_20_self_attn_layer_norm_weight5, model_decoder_layers_20_self_attn_layer_norm_bias5, alloc1450) 
R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias5) model_decoder_layers_20_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] model_decoder_layers_20_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[973] alloc1451: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1450, model_decoder_layers_20_self_attn_q_proj_weight5, model_decoder_layers_20_self_attn_q_proj_bias5, alloc1451) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias5) model_decoder_layers_20_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] alloc1452: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1450, model_decoder_layers_20_self_attn_k_proj_weight5, alloc1452) R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight5) model_decoder_layers_20_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] model_decoder_layers_20_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[971] alloc1453: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1450, model_decoder_layers_20_self_attn_v_proj_weight5, model_decoder_layers_20_self_attn_v_proj_bias5, alloc1453) R.vm.kill_object(alloc1450) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias5) alloc1454: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1451, alloc1452, alloc1453, alloc1454) R.vm.kill_object(alloc1451) R.vm.kill_object(alloc1452) R.vm.kill_object(alloc1453) alloc1455: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1453: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), alloc1454, alloc1455) R.vm.kill_object(alloc1454) lv264_1: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1455, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1455) model_decoder_layers_20_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] model_decoder_layers_20_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[975] alloc1456: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv264_1, model_decoder_layers_20_self_attn_out_proj_weight5, model_decoder_layers_20_self_attn_out_proj_bias5, alloc1449, alloc1456) R.vm.kill_object(alloc1449) R.vm.kill_object(lv264_1) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias5) model_decoder_layers_20_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[985] model_decoder_layers_20_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[986] alloc1457: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1456, model_decoder_layers_20_encoder_attn_layer_norm_weight5, model_decoder_layers_20_encoder_attn_layer_norm_bias5, alloc1457) 
R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias5) model_decoder_layers_20_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] model_decoder_layers_20_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[982] alloc1458: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1457, model_decoder_layers_20_encoder_attn_q_proj_weight5, model_decoder_layers_20_encoder_attn_q_proj_bias5, alloc1458) R.vm.kill_object(alloc1457) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias5) lv267: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1458, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1458) alloc1459: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1457: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), lv267, alloc1459) R.vm.kill_object(lv267) lv268: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1459, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1459) model_decoder_layers_20_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] model_decoder_layers_20_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[984] alloc1460: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv268, 
model_decoder_layers_20_encoder_attn_out_proj_weight5, model_decoder_layers_20_encoder_attn_out_proj_bias5, alloc1456, alloc1460) R.vm.kill_object(alloc1456) R.vm.kill_object(lv268) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias5) model_decoder_layers_20_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[991] model_decoder_layers_20_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[992] alloc1461: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1460, model_decoder_layers_20_final_layer_norm_weight5, model_decoder_layers_20_final_layer_norm_bias5, alloc1461) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias5) model_decoder_layers_20_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[987] model_decoder_layers_20_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[988] alloc1462: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1461, model_decoder_layers_20_fc1_weight5, model_decoder_layers_20_fc1_bias5, alloc1462) R.vm.kill_object(alloc1461) R.vm.kill_object(model_decoder_layers_20_fc1_weight5) R.vm.kill_object(model_decoder_layers_20_fc1_bias5) model_decoder_layers_20_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] model_decoder_layers_20_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[990] alloc1463: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1462, model_decoder_layers_20_fc2_weight5, model_decoder_layers_20_fc2_bias5, alloc1460, 
alloc1463) R.vm.kill_object(alloc1460) R.vm.kill_object(alloc1462) R.vm.kill_object(model_decoder_layers_20_fc2_weight5) R.vm.kill_object(model_decoder_layers_20_fc2_bias5) model_decoder_layers_21_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1000] model_decoder_layers_21_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1001] alloc1464: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1463, model_decoder_layers_21_self_attn_layer_norm_weight5, model_decoder_layers_21_self_attn_layer_norm_bias5, alloc1464) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias5) model_decoder_layers_21_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] model_decoder_layers_21_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[997] alloc1465: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1464, model_decoder_layers_21_self_attn_q_proj_weight5, model_decoder_layers_21_self_attn_q_proj_bias5, alloc1465) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias5) model_decoder_layers_21_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] alloc1466: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1464, model_decoder_layers_21_self_attn_k_proj_weight5, alloc1466) R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight5) model_decoder_layers_21_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] 
model_decoder_layers_21_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[995] alloc1467: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1464, model_decoder_layers_21_self_attn_v_proj_weight5, model_decoder_layers_21_self_attn_v_proj_bias5, alloc1467) R.vm.kill_object(alloc1464) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias5) alloc1468: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1465, alloc1466, alloc1467, alloc1468) R.vm.kill_object(alloc1465) R.vm.kill_object(alloc1466) R.vm.kill_object(alloc1467) alloc1469: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1467: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), alloc1468, alloc1469) R.vm.kill_object(alloc1468) lv275: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1469, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1469) model_decoder_layers_21_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] model_decoder_layers_21_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[999] alloc1470: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv275, model_decoder_layers_21_self_attn_out_proj_weight5, model_decoder_layers_21_self_attn_out_proj_bias5, alloc1463, alloc1470) R.vm.kill_object(alloc1463) R.vm.kill_object(lv275) 
R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias5) model_decoder_layers_21_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1009] model_decoder_layers_21_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1010] alloc1471: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1470, model_decoder_layers_21_encoder_attn_layer_norm_weight5, model_decoder_layers_21_encoder_attn_layer_norm_bias5, alloc1471) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias5) model_decoder_layers_21_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] model_decoder_layers_21_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1006] alloc1472: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1471, model_decoder_layers_21_encoder_attn_q_proj_weight5, model_decoder_layers_21_encoder_attn_q_proj_bias5, alloc1472) R.vm.kill_object(alloc1471) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias5) lv278: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1472, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1472) alloc1473: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1471: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), lv278, alloc1473) 
R.vm.kill_object(lv278) lv279: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1473, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1473) model_decoder_layers_21_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] model_decoder_layers_21_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1008] alloc1474: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv279, model_decoder_layers_21_encoder_attn_out_proj_weight5, model_decoder_layers_21_encoder_attn_out_proj_bias5, alloc1470, alloc1474) R.vm.kill_object(alloc1470) R.vm.kill_object(lv279) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias5) model_decoder_layers_21_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1015] model_decoder_layers_21_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1016] alloc1475: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1474, model_decoder_layers_21_final_layer_norm_weight5, model_decoder_layers_21_final_layer_norm_bias5, alloc1475) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias5) model_decoder_layers_21_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] model_decoder_layers_21_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1012] alloc1476: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1475, model_decoder_layers_21_fc1_weight5, 
model_decoder_layers_21_fc1_bias5, alloc1476) R.vm.kill_object(alloc1475) R.vm.kill_object(model_decoder_layers_21_fc1_weight5) R.vm.kill_object(model_decoder_layers_21_fc1_bias5) model_decoder_layers_21_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] model_decoder_layers_21_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1014] alloc1477: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1476, model_decoder_layers_21_fc2_weight5, model_decoder_layers_21_fc2_bias5, alloc1474, alloc1477) R.vm.kill_object(alloc1474) R.vm.kill_object(alloc1476) R.vm.kill_object(model_decoder_layers_21_fc2_weight5) R.vm.kill_object(model_decoder_layers_21_fc2_bias5) model_decoder_layers_22_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1024] model_decoder_layers_22_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1025] alloc1478: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1477, model_decoder_layers_22_self_attn_layer_norm_weight5, model_decoder_layers_22_self_attn_layer_norm_bias5, alloc1478) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias5) model_decoder_layers_22_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] model_decoder_layers_22_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1021] alloc1479: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1478, model_decoder_layers_22_self_attn_q_proj_weight5, model_decoder_layers_22_self_attn_q_proj_bias5, alloc1479) 
R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias5) model_decoder_layers_22_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] alloc1480: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1478, model_decoder_layers_22_self_attn_k_proj_weight5, alloc1480) R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight5) model_decoder_layers_22_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] model_decoder_layers_22_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1019] alloc1481: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1478, model_decoder_layers_22_self_attn_v_proj_weight5, model_decoder_layers_22_self_attn_v_proj_bias5, alloc1481) R.vm.kill_object(alloc1478) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias5) alloc1482: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1479, alloc1480, alloc1481, alloc1482) R.vm.kill_object(alloc1479) R.vm.kill_object(alloc1480) R.vm.kill_object(alloc1481) alloc1483: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1481: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), alloc1482, alloc1483) R.vm.kill_object(alloc1482) lv286: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1483, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1483) model_decoder_layers_22_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] model_decoder_layers_22_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1023] alloc1484: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv286, model_decoder_layers_22_self_attn_out_proj_weight5, model_decoder_layers_22_self_attn_out_proj_bias5, alloc1477, alloc1484) R.vm.kill_object(alloc1477) R.vm.kill_object(lv286) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias5) model_decoder_layers_22_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1033] model_decoder_layers_22_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1034] alloc1485: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1484, model_decoder_layers_22_encoder_attn_layer_norm_weight5, model_decoder_layers_22_encoder_attn_layer_norm_bias5, alloc1485) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias5) model_decoder_layers_22_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] model_decoder_layers_22_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1030] alloc1486: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1485, model_decoder_layers_22_encoder_attn_q_proj_weight5, model_decoder_layers_22_encoder_attn_q_proj_bias5, alloc1486) 
R.vm.kill_object(alloc1485) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias5) lv289: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1486, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1486) alloc1487: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1485: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), lv289, alloc1487) R.vm.kill_object(lv289) lv290: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1487, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1487) model_decoder_layers_22_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] model_decoder_layers_22_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1032] alloc1488: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv290, model_decoder_layers_22_encoder_attn_out_proj_weight5, model_decoder_layers_22_encoder_attn_out_proj_bias5, alloc1484, alloc1488) R.vm.kill_object(alloc1484) R.vm.kill_object(lv290) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias5) model_decoder_layers_22_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1039] model_decoder_layers_22_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1040] alloc1489: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.layer_norm3(alloc1488, model_decoder_layers_22_final_layer_norm_weight5, model_decoder_layers_22_final_layer_norm_bias5, alloc1489) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias5) model_decoder_layers_22_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] model_decoder_layers_22_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1036] alloc1490: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1489, model_decoder_layers_22_fc1_weight5, model_decoder_layers_22_fc1_bias5, alloc1490) R.vm.kill_object(alloc1489) R.vm.kill_object(model_decoder_layers_22_fc1_weight5) R.vm.kill_object(model_decoder_layers_22_fc1_bias5) model_decoder_layers_22_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] model_decoder_layers_22_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1038] alloc1491: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1490, model_decoder_layers_22_fc2_weight5, model_decoder_layers_22_fc2_bias5, alloc1488, alloc1491) R.vm.kill_object(alloc1488) R.vm.kill_object(alloc1490) R.vm.kill_object(model_decoder_layers_22_fc2_weight5) R.vm.kill_object(model_decoder_layers_22_fc2_bias5) model_decoder_layers_23_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1048] model_decoder_layers_23_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1049] alloc1492: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1491, model_decoder_layers_23_self_attn_layer_norm_weight5, model_decoder_layers_23_self_attn_layer_norm_bias5, alloc1492) 
R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias5) model_decoder_layers_23_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] model_decoder_layers_23_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1045] alloc1493: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1492, model_decoder_layers_23_self_attn_q_proj_weight5, model_decoder_layers_23_self_attn_q_proj_bias5, alloc1493) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias5) model_decoder_layers_23_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] alloc1494: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1492, model_decoder_layers_23_self_attn_k_proj_weight5, alloc1494) R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight5) model_decoder_layers_23_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] model_decoder_layers_23_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1043] alloc1495: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1492, model_decoder_layers_23_self_attn_v_proj_weight5, model_decoder_layers_23_self_attn_v_proj_bias5, alloc1495) R.vm.kill_object(alloc1492) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias5) alloc1496: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1493, alloc1494, alloc1495, alloc1496) R.vm.kill_object(alloc1493) R.vm.kill_object(alloc1494) R.vm.kill_object(alloc1495) alloc1497: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1495: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), alloc1496, alloc1497) R.vm.kill_object(alloc1496) lv297: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1497, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1497) model_decoder_layers_23_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] model_decoder_layers_23_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1047] alloc1498: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv297, model_decoder_layers_23_self_attn_out_proj_weight5, model_decoder_layers_23_self_attn_out_proj_bias5, alloc1491, alloc1498) R.vm.kill_object(alloc1491) R.vm.kill_object(lv297) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias5) model_decoder_layers_23_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1057] model_decoder_layers_23_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1058] alloc1499: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1498, model_decoder_layers_23_encoder_attn_layer_norm_weight5, model_decoder_layers_23_encoder_attn_layer_norm_bias5, alloc1499) 
R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias5) model_decoder_layers_23_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] model_decoder_layers_23_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1054] alloc1500: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1499, model_decoder_layers_23_encoder_attn_q_proj_weight5, model_decoder_layers_23_encoder_attn_q_proj_bias5, alloc1500) R.vm.kill_object(alloc1499) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias5) lv300: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1500, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1500) alloc1501: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1499: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), lv300, alloc1501) R.vm.kill_object(lv300) lv301: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1501, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1501) model_decoder_layers_23_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] model_decoder_layers_23_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1056] alloc1502: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv301, 
model_decoder_layers_23_encoder_attn_out_proj_weight5, model_decoder_layers_23_encoder_attn_out_proj_bias5, alloc1498, alloc1502) R.vm.kill_object(alloc1498) R.vm.kill_object(lv301) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias5) model_decoder_layers_23_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1063] model_decoder_layers_23_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1064] alloc1503: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1502, model_decoder_layers_23_final_layer_norm_weight5, model_decoder_layers_23_final_layer_norm_bias5, alloc1503) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias5) model_decoder_layers_23_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] model_decoder_layers_23_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1060] alloc1504: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1503, model_decoder_layers_23_fc1_weight5, model_decoder_layers_23_fc1_bias5, alloc1504) R.vm.kill_object(alloc1503) R.vm.kill_object(model_decoder_layers_23_fc1_weight5) R.vm.kill_object(model_decoder_layers_23_fc1_bias5) model_decoder_layers_23_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] model_decoder_layers_23_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1062] alloc1505: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1504, model_decoder_layers_23_fc2_weight5, model_decoder_layers_23_fc2_bias5, alloc1502, 
alloc1505) R.vm.kill_object(alloc1502) R.vm.kill_object(alloc1504) R.vm.kill_object(model_decoder_layers_23_fc2_weight5) R.vm.kill_object(model_decoder_layers_23_fc2_bias5) model_decoder_layers_24_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1072] model_decoder_layers_24_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1073] alloc1506: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1505, model_decoder_layers_24_self_attn_layer_norm_weight5, model_decoder_layers_24_self_attn_layer_norm_bias5, alloc1506) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias5) model_decoder_layers_24_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] model_decoder_layers_24_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1069] alloc1507: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1506, model_decoder_layers_24_self_attn_q_proj_weight5, model_decoder_layers_24_self_attn_q_proj_bias5, alloc1507) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias5) model_decoder_layers_24_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] alloc1508: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1506, model_decoder_layers_24_self_attn_k_proj_weight5, alloc1508) R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight5) model_decoder_layers_24_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] 
model_decoder_layers_24_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1067] alloc1509: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1506, model_decoder_layers_24_self_attn_v_proj_weight5, model_decoder_layers_24_self_attn_v_proj_bias5, alloc1509) R.vm.kill_object(alloc1506) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias5) alloc1510: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1507, alloc1508, alloc1509, alloc1510) R.vm.kill_object(alloc1507) R.vm.kill_object(alloc1508) R.vm.kill_object(alloc1509) alloc1511: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1509: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), alloc1510, alloc1511) R.vm.kill_object(alloc1510) lv308: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1511, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1511) model_decoder_layers_24_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] model_decoder_layers_24_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1071] alloc1512: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv308, model_decoder_layers_24_self_attn_out_proj_weight5, model_decoder_layers_24_self_attn_out_proj_bias5, alloc1505, alloc1512) R.vm.kill_object(alloc1505) 
R.vm.kill_object(lv308) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias5) model_decoder_layers_24_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1081] model_decoder_layers_24_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1082] alloc1513: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1512, model_decoder_layers_24_encoder_attn_layer_norm_weight5, model_decoder_layers_24_encoder_attn_layer_norm_bias5, alloc1513) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias5) model_decoder_layers_24_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1077] model_decoder_layers_24_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1078] alloc1514: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1513, model_decoder_layers_24_encoder_attn_q_proj_weight5, model_decoder_layers_24_encoder_attn_q_proj_bias5, alloc1514) R.vm.kill_object(alloc1513) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias5) lv311: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1514, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1514) alloc1515: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1513: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), lv311, 
alloc1515) R.vm.kill_object(lv311) lv312: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1515, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1515) model_decoder_layers_24_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] model_decoder_layers_24_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1080] alloc1516: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv312, model_decoder_layers_24_encoder_attn_out_proj_weight5, model_decoder_layers_24_encoder_attn_out_proj_bias5, alloc1512, alloc1516) R.vm.kill_object(alloc1512) R.vm.kill_object(lv312) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias5) model_decoder_layers_24_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1087] model_decoder_layers_24_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1088] alloc1517: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1516, model_decoder_layers_24_final_layer_norm_weight5, model_decoder_layers_24_final_layer_norm_bias5, alloc1517) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias5) model_decoder_layers_24_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] model_decoder_layers_24_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1084] alloc1518: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1517, 
model_decoder_layers_24_fc1_weight5, model_decoder_layers_24_fc1_bias5, alloc1518) R.vm.kill_object(alloc1517) R.vm.kill_object(model_decoder_layers_24_fc1_weight5) R.vm.kill_object(model_decoder_layers_24_fc1_bias5) model_decoder_layers_24_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] model_decoder_layers_24_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1086] alloc1519: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1518, model_decoder_layers_24_fc2_weight5, model_decoder_layers_24_fc2_bias5, alloc1516, alloc1519) R.vm.kill_object(alloc1516) R.vm.kill_object(alloc1518) R.vm.kill_object(model_decoder_layers_24_fc2_weight5) R.vm.kill_object(model_decoder_layers_24_fc2_bias5) model_decoder_layers_25_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1096] model_decoder_layers_25_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1097] alloc1520: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1519, model_decoder_layers_25_self_attn_layer_norm_weight5, model_decoder_layers_25_self_attn_layer_norm_bias5, alloc1520) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias5) model_decoder_layers_25_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] model_decoder_layers_25_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1093] alloc1521: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1520, model_decoder_layers_25_self_attn_q_proj_weight5, model_decoder_layers_25_self_attn_q_proj_bias5, alloc1521) 
R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias5) model_decoder_layers_25_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] alloc1522: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1520, model_decoder_layers_25_self_attn_k_proj_weight5, alloc1522) R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight5) model_decoder_layers_25_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] model_decoder_layers_25_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1091] alloc1523: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1520, model_decoder_layers_25_self_attn_v_proj_weight5, model_decoder_layers_25_self_attn_v_proj_bias5, alloc1523) R.vm.kill_object(alloc1520) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias5) alloc1524: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1521, alloc1522, alloc1523, alloc1524) R.vm.kill_object(alloc1521) R.vm.kill_object(alloc1522) R.vm.kill_object(alloc1523) alloc1525: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1523: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), alloc1524, alloc1525) R.vm.kill_object(alloc1524) lv319: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1525, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1525) model_decoder_layers_25_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] model_decoder_layers_25_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1095] alloc1526: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv319, model_decoder_layers_25_self_attn_out_proj_weight5, model_decoder_layers_25_self_attn_out_proj_bias5, alloc1519, alloc1526) R.vm.kill_object(alloc1519) R.vm.kill_object(lv319) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias5) model_decoder_layers_25_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1105] model_decoder_layers_25_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1106] alloc1527: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1526, model_decoder_layers_25_encoder_attn_layer_norm_weight5, model_decoder_layers_25_encoder_attn_layer_norm_bias5, alloc1527) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias5) model_decoder_layers_25_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] model_decoder_layers_25_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1102] alloc1528: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1527, model_decoder_layers_25_encoder_attn_q_proj_weight5, model_decoder_layers_25_encoder_attn_q_proj_bias5, alloc1528) 
R.vm.kill_object(alloc1527) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias5) lv322: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1528, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1528) alloc1529: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1527: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), lv322, alloc1529) R.vm.kill_object(lv322) lv323: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1529, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1529) model_decoder_layers_25_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] model_decoder_layers_25_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1104] alloc1530: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv323, model_decoder_layers_25_encoder_attn_out_proj_weight5, model_decoder_layers_25_encoder_attn_out_proj_bias5, alloc1526, alloc1530) R.vm.kill_object(alloc1526) R.vm.kill_object(lv323) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias5) model_decoder_layers_25_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1111] model_decoder_layers_25_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1112] alloc1531: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.layer_norm3(alloc1530, model_decoder_layers_25_final_layer_norm_weight5, model_decoder_layers_25_final_layer_norm_bias5, alloc1531) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias5) model_decoder_layers_25_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] model_decoder_layers_25_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1108] alloc1532: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1531, model_decoder_layers_25_fc1_weight5, model_decoder_layers_25_fc1_bias5, alloc1532) R.vm.kill_object(alloc1531) R.vm.kill_object(model_decoder_layers_25_fc1_weight5) R.vm.kill_object(model_decoder_layers_25_fc1_bias5) model_decoder_layers_25_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] model_decoder_layers_25_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1110] alloc1533: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1532, model_decoder_layers_25_fc2_weight5, model_decoder_layers_25_fc2_bias5, alloc1530, alloc1533) R.vm.kill_object(alloc1530) R.vm.kill_object(alloc1532) R.vm.kill_object(model_decoder_layers_25_fc2_weight5) R.vm.kill_object(model_decoder_layers_25_fc2_bias5) model_decoder_layers_26_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1120] model_decoder_layers_26_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1121] alloc1534: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1533, model_decoder_layers_26_self_attn_layer_norm_weight5, model_decoder_layers_26_self_attn_layer_norm_bias5, alloc1534) 
R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias5) model_decoder_layers_26_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] model_decoder_layers_26_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1117] alloc1535: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1534, model_decoder_layers_26_self_attn_q_proj_weight5, model_decoder_layers_26_self_attn_q_proj_bias5, alloc1535) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias5) model_decoder_layers_26_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] alloc1536: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1534, model_decoder_layers_26_self_attn_k_proj_weight5, alloc1536) R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight5) model_decoder_layers_26_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] model_decoder_layers_26_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1115] alloc1537: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1534, model_decoder_layers_26_self_attn_v_proj_weight5, model_decoder_layers_26_self_attn_v_proj_bias5, alloc1537) R.vm.kill_object(alloc1534) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias5) alloc1538: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1535, alloc1536, alloc1537, alloc1538) R.vm.kill_object(alloc1535) R.vm.kill_object(alloc1536) R.vm.kill_object(alloc1537) alloc1539: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1537: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), alloc1538, alloc1539) R.vm.kill_object(alloc1538) lv330: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1539, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1539) model_decoder_layers_26_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] model_decoder_layers_26_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1119] alloc1540: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv330, model_decoder_layers_26_self_attn_out_proj_weight5, model_decoder_layers_26_self_attn_out_proj_bias5, alloc1533, alloc1540) R.vm.kill_object(alloc1533) R.vm.kill_object(lv330) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias5) model_decoder_layers_26_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1129] model_decoder_layers_26_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1130] alloc1541: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1540, model_decoder_layers_26_encoder_attn_layer_norm_weight5, model_decoder_layers_26_encoder_attn_layer_norm_bias5, alloc1541) 
R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias5) model_decoder_layers_26_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] model_decoder_layers_26_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1126] alloc1542: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1541, model_decoder_layers_26_encoder_attn_q_proj_weight5, model_decoder_layers_26_encoder_attn_q_proj_bias5, alloc1542) R.vm.kill_object(alloc1541) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias5) lv333: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1542, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1542) alloc1543: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1541: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), lv333, alloc1543) R.vm.kill_object(lv333) lv334: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1543, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1543) model_decoder_layers_26_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] model_decoder_layers_26_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1128] alloc1544: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv334, 
model_decoder_layers_26_encoder_attn_out_proj_weight5, model_decoder_layers_26_encoder_attn_out_proj_bias5, alloc1540, alloc1544) R.vm.kill_object(alloc1540) R.vm.kill_object(lv334) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias5) model_decoder_layers_26_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1135] model_decoder_layers_26_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1136] alloc1545: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1544, model_decoder_layers_26_final_layer_norm_weight5, model_decoder_layers_26_final_layer_norm_bias5, alloc1545) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias5) model_decoder_layers_26_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] model_decoder_layers_26_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1132] alloc1546: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1545, model_decoder_layers_26_fc1_weight5, model_decoder_layers_26_fc1_bias5, alloc1546) R.vm.kill_object(alloc1545) R.vm.kill_object(model_decoder_layers_26_fc1_weight5) R.vm.kill_object(model_decoder_layers_26_fc1_bias5) model_decoder_layers_26_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] model_decoder_layers_26_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1134] alloc1547: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1546, model_decoder_layers_26_fc2_weight5, model_decoder_layers_26_fc2_bias5, alloc1544, 
alloc1547) R.vm.kill_object(alloc1544) R.vm.kill_object(alloc1546) R.vm.kill_object(model_decoder_layers_26_fc2_weight5) R.vm.kill_object(model_decoder_layers_26_fc2_bias5) model_decoder_layers_27_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1144] model_decoder_layers_27_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1145] alloc1548: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1547, model_decoder_layers_27_self_attn_layer_norm_weight5, model_decoder_layers_27_self_attn_layer_norm_bias5, alloc1548) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias5) model_decoder_layers_27_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] model_decoder_layers_27_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1141] alloc1549: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1548, model_decoder_layers_27_self_attn_q_proj_weight5, model_decoder_layers_27_self_attn_q_proj_bias5, alloc1549) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias5) model_decoder_layers_27_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] alloc1550: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1548, model_decoder_layers_27_self_attn_k_proj_weight5, alloc1550) R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight5) model_decoder_layers_27_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] 
model_decoder_layers_27_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1139] alloc1551: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1548, model_decoder_layers_27_self_attn_v_proj_weight5, model_decoder_layers_27_self_attn_v_proj_bias5, alloc1551) R.vm.kill_object(alloc1548) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias5) alloc1552: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1549, alloc1550, alloc1551, alloc1552) R.vm.kill_object(alloc1549) R.vm.kill_object(alloc1550) R.vm.kill_object(alloc1551) alloc1553: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1551: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), alloc1552, alloc1553) R.vm.kill_object(alloc1552) lv341: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1553, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1553) model_decoder_layers_27_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] model_decoder_layers_27_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1143] alloc1554: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv341, model_decoder_layers_27_self_attn_out_proj_weight5, model_decoder_layers_27_self_attn_out_proj_bias5, alloc1547, alloc1554) R.vm.kill_object(alloc1547) 
R.vm.kill_object(lv341) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias5) model_decoder_layers_27_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1153] model_decoder_layers_27_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1154] alloc1555: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1554, model_decoder_layers_27_encoder_attn_layer_norm_weight5, model_decoder_layers_27_encoder_attn_layer_norm_bias5, alloc1555) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias5) model_decoder_layers_27_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] model_decoder_layers_27_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1150] alloc1556: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1555, model_decoder_layers_27_encoder_attn_q_proj_weight5, model_decoder_layers_27_encoder_attn_q_proj_bias5, alloc1556) R.vm.kill_object(alloc1555) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias5) lv344: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1556, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1556) alloc1557: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1555: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), lv344, 
alloc1557) R.vm.kill_object(lv344) lv345: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1557, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1557) model_decoder_layers_27_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] model_decoder_layers_27_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1152] alloc1558: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv345, model_decoder_layers_27_encoder_attn_out_proj_weight5, model_decoder_layers_27_encoder_attn_out_proj_bias5, alloc1554, alloc1558) R.vm.kill_object(alloc1554) R.vm.kill_object(lv345) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias5) model_decoder_layers_27_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1159] model_decoder_layers_27_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1160] alloc1559: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1558, model_decoder_layers_27_final_layer_norm_weight5, model_decoder_layers_27_final_layer_norm_bias5, alloc1559) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias5) model_decoder_layers_27_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1155] model_decoder_layers_27_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1156] alloc1560: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1559, 
model_decoder_layers_27_fc1_weight5, model_decoder_layers_27_fc1_bias5, alloc1560) R.vm.kill_object(alloc1559) R.vm.kill_object(model_decoder_layers_27_fc1_weight5) R.vm.kill_object(model_decoder_layers_27_fc1_bias5) model_decoder_layers_27_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] model_decoder_layers_27_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1158] alloc1561: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1560, model_decoder_layers_27_fc2_weight5, model_decoder_layers_27_fc2_bias5, alloc1558, alloc1561) R.vm.kill_object(alloc1558) R.vm.kill_object(alloc1560) R.vm.kill_object(model_decoder_layers_27_fc2_weight5) R.vm.kill_object(model_decoder_layers_27_fc2_bias5) model_decoder_layers_28_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1168] model_decoder_layers_28_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1169] alloc1562: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1561, model_decoder_layers_28_self_attn_layer_norm_weight5, model_decoder_layers_28_self_attn_layer_norm_bias5, alloc1562) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias5) model_decoder_layers_28_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] model_decoder_layers_28_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1165] alloc1563: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1562, model_decoder_layers_28_self_attn_q_proj_weight5, model_decoder_layers_28_self_attn_q_proj_bias5, alloc1563) 
R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias5) model_decoder_layers_28_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] alloc1564: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1562, model_decoder_layers_28_self_attn_k_proj_weight5, alloc1564) R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight5) model_decoder_layers_28_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] model_decoder_layers_28_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1163] alloc1565: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1562, model_decoder_layers_28_self_attn_v_proj_weight5, model_decoder_layers_28_self_attn_v_proj_bias5, alloc1565) R.vm.kill_object(alloc1562) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias5) alloc1566: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1563, alloc1564, alloc1565, alloc1566) R.vm.kill_object(alloc1563) R.vm.kill_object(alloc1564) R.vm.kill_object(alloc1565) alloc1567: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1565: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), alloc1566, alloc1567) R.vm.kill_object(alloc1566) lv352: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1567, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1567) model_decoder_layers_28_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] model_decoder_layers_28_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1167] alloc1568: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv352, model_decoder_layers_28_self_attn_out_proj_weight5, model_decoder_layers_28_self_attn_out_proj_bias5, alloc1561, alloc1568) R.vm.kill_object(alloc1561) R.vm.kill_object(lv352) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias5) model_decoder_layers_28_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1177] model_decoder_layers_28_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1178] alloc1569: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1568, model_decoder_layers_28_encoder_attn_layer_norm_weight5, model_decoder_layers_28_encoder_attn_layer_norm_bias5, alloc1569) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias5) model_decoder_layers_28_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] model_decoder_layers_28_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1174] alloc1570: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1569, model_decoder_layers_28_encoder_attn_q_proj_weight5, model_decoder_layers_28_encoder_attn_q_proj_bias5, alloc1570) 
R.vm.kill_object(alloc1569) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias5) lv355: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1570, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1570) alloc1571: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1569: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), lv355, alloc1571) R.vm.kill_object(lv355) lv356: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1571, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1571) model_decoder_layers_28_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] model_decoder_layers_28_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1176] alloc1572: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv356, model_decoder_layers_28_encoder_attn_out_proj_weight5, model_decoder_layers_28_encoder_attn_out_proj_bias5, alloc1568, alloc1572) R.vm.kill_object(alloc1568) R.vm.kill_object(lv356) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias5) model_decoder_layers_28_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1183] model_decoder_layers_28_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1184] alloc1573: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) 
cls.layer_norm3(alloc1572, model_decoder_layers_28_final_layer_norm_weight5, model_decoder_layers_28_final_layer_norm_bias5, alloc1573) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias5) model_decoder_layers_28_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] model_decoder_layers_28_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1180] alloc1574: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1573, model_decoder_layers_28_fc1_weight5, model_decoder_layers_28_fc1_bias5, alloc1574) R.vm.kill_object(alloc1573) R.vm.kill_object(model_decoder_layers_28_fc1_weight5) R.vm.kill_object(model_decoder_layers_28_fc1_bias5) model_decoder_layers_28_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] model_decoder_layers_28_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1182] alloc1575: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1574, model_decoder_layers_28_fc2_weight5, model_decoder_layers_28_fc2_bias5, alloc1572, alloc1575) R.vm.kill_object(alloc1572) R.vm.kill_object(alloc1574) R.vm.kill_object(model_decoder_layers_28_fc2_weight5) R.vm.kill_object(model_decoder_layers_28_fc2_bias5) model_decoder_layers_29_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1192] model_decoder_layers_29_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1193] alloc1576: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1575, model_decoder_layers_29_self_attn_layer_norm_weight5, model_decoder_layers_29_self_attn_layer_norm_bias5, alloc1576) 
R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias5) model_decoder_layers_29_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] model_decoder_layers_29_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1189] alloc1577: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1576, model_decoder_layers_29_self_attn_q_proj_weight5, model_decoder_layers_29_self_attn_q_proj_bias5, alloc1577) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias5) model_decoder_layers_29_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] alloc1578: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1576, model_decoder_layers_29_self_attn_k_proj_weight5, alloc1578) R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight5) model_decoder_layers_29_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] model_decoder_layers_29_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1187] alloc1579: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1576, model_decoder_layers_29_self_attn_v_proj_weight5, model_decoder_layers_29_self_attn_v_proj_bias5, alloc1579) R.vm.kill_object(alloc1576) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias5) alloc1580: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1577, alloc1578, alloc1579, alloc1580) R.vm.kill_object(alloc1577) R.vm.kill_object(alloc1578) R.vm.kill_object(alloc1579) alloc1581: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1579: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), alloc1580, alloc1581) R.vm.kill_object(alloc1580) lv363: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1581, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1581) model_decoder_layers_29_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] model_decoder_layers_29_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1191] alloc1582: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv363, model_decoder_layers_29_self_attn_out_proj_weight5, model_decoder_layers_29_self_attn_out_proj_bias5, alloc1575, alloc1582) R.vm.kill_object(alloc1575) R.vm.kill_object(lv363) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias5) model_decoder_layers_29_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1201] model_decoder_layers_29_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1202] alloc1583: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1582, model_decoder_layers_29_encoder_attn_layer_norm_weight5, model_decoder_layers_29_encoder_attn_layer_norm_bias5, alloc1583) 
R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias5) model_decoder_layers_29_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] model_decoder_layers_29_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1198] alloc1584: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1583, model_decoder_layers_29_encoder_attn_q_proj_weight5, model_decoder_layers_29_encoder_attn_q_proj_bias5, alloc1584) R.vm.kill_object(alloc1583) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias5) lv366: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1584, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1584) alloc1585: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1583: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), lv366, alloc1585) R.vm.kill_object(lv366) lv367: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1585, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1585) model_decoder_layers_29_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] model_decoder_layers_29_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1200] alloc1586: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv367, 
model_decoder_layers_29_encoder_attn_out_proj_weight5, model_decoder_layers_29_encoder_attn_out_proj_bias5, alloc1582, alloc1586) R.vm.kill_object(alloc1582) R.vm.kill_object(lv367) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias5) model_decoder_layers_29_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1207] model_decoder_layers_29_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1208] alloc1587: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1586, model_decoder_layers_29_final_layer_norm_weight5, model_decoder_layers_29_final_layer_norm_bias5, alloc1587) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias5) model_decoder_layers_29_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] model_decoder_layers_29_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1204] alloc1588: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1587, model_decoder_layers_29_fc1_weight5, model_decoder_layers_29_fc1_bias5, alloc1588) R.vm.kill_object(alloc1587) R.vm.kill_object(model_decoder_layers_29_fc1_weight5) R.vm.kill_object(model_decoder_layers_29_fc1_bias5) model_decoder_layers_29_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] model_decoder_layers_29_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1206] alloc1589: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1588, model_decoder_layers_29_fc2_weight5, model_decoder_layers_29_fc2_bias5, alloc1586, 
alloc1589) R.vm.kill_object(alloc1586) R.vm.kill_object(alloc1588) R.vm.kill_object(model_decoder_layers_29_fc2_weight5) R.vm.kill_object(model_decoder_layers_29_fc2_bias5) model_decoder_layers_30_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1216] model_decoder_layers_30_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1217] alloc1590: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1589, model_decoder_layers_30_self_attn_layer_norm_weight5, model_decoder_layers_30_self_attn_layer_norm_bias5, alloc1590) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias5) model_decoder_layers_30_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] model_decoder_layers_30_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1213] alloc1591: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1590, model_decoder_layers_30_self_attn_q_proj_weight5, model_decoder_layers_30_self_attn_q_proj_bias5, alloc1591) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias5) model_decoder_layers_30_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] alloc1592: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1590, model_decoder_layers_30_self_attn_k_proj_weight5, alloc1592) R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight5) model_decoder_layers_30_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] 
model_decoder_layers_30_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1211] alloc1593: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1590, model_decoder_layers_30_self_attn_v_proj_weight5, model_decoder_layers_30_self_attn_v_proj_bias5, alloc1593) R.vm.kill_object(alloc1590) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias5) alloc1594: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1591, alloc1592, alloc1593, alloc1594) R.vm.kill_object(alloc1591) R.vm.kill_object(alloc1592) R.vm.kill_object(alloc1593) alloc1595: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1593: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), alloc1594, alloc1595) R.vm.kill_object(alloc1594) lv374: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1595, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1595) model_decoder_layers_30_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] model_decoder_layers_30_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1215] alloc1596: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv374, model_decoder_layers_30_self_attn_out_proj_weight5, model_decoder_layers_30_self_attn_out_proj_bias5, alloc1589, alloc1596) R.vm.kill_object(alloc1589) 
R.vm.kill_object(lv374) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias5) model_decoder_layers_30_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1225] model_decoder_layers_30_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1226] alloc1597: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1596, model_decoder_layers_30_encoder_attn_layer_norm_weight5, model_decoder_layers_30_encoder_attn_layer_norm_bias5, alloc1597) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias5) model_decoder_layers_30_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] model_decoder_layers_30_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1222] alloc1598: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1597, model_decoder_layers_30_encoder_attn_q_proj_weight5, model_decoder_layers_30_encoder_attn_q_proj_bias5, alloc1598) R.vm.kill_object(alloc1597) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias5) lv377: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1598, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1598) alloc1599: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1597: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), lv377, 
alloc1599) R.vm.kill_object(lv377) lv378: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1599, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1599) model_decoder_layers_30_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] model_decoder_layers_30_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1224] alloc1600: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7_add6(lv378, model_decoder_layers_30_encoder_attn_out_proj_weight5, model_decoder_layers_30_encoder_attn_out_proj_bias5, alloc1596, alloc1600) R.vm.kill_object(alloc1596) R.vm.kill_object(lv378) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias5) model_decoder_layers_30_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1231] model_decoder_layers_30_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1232] alloc1601: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1600, model_decoder_layers_30_final_layer_norm_weight5, model_decoder_layers_30_final_layer_norm_bias5, alloc1601) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias5) model_decoder_layers_30_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1227] model_decoder_layers_30_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1228] alloc1602: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) cls.fused_NT_matmul1_add8_gelu2(alloc1601, 
model_decoder_layers_30_fc1_weight5, model_decoder_layers_30_fc1_bias5, alloc1602) R.vm.kill_object(alloc1601) R.vm.kill_object(model_decoder_layers_30_fc1_weight5) R.vm.kill_object(model_decoder_layers_30_fc1_bias5) model_decoder_layers_30_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] model_decoder_layers_30_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1230] alloc1603: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul2_add7_add6(alloc1602, model_decoder_layers_30_fc2_weight5, model_decoder_layers_30_fc2_bias5, alloc1600, alloc1603) R.vm.kill_object(alloc1600) R.vm.kill_object(alloc1602) R.vm.kill_object(model_decoder_layers_30_fc2_weight5) R.vm.kill_object(model_decoder_layers_30_fc2_bias5) model_decoder_layers_31_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1240] model_decoder_layers_31_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1241] alloc1604: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1603, model_decoder_layers_31_self_attn_layer_norm_weight5, model_decoder_layers_31_self_attn_layer_norm_bias5, alloc1604) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias5) model_decoder_layers_31_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] model_decoder_layers_31_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1237] alloc1605: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1604, model_decoder_layers_31_self_attn_q_proj_weight5, model_decoder_layers_31_self_attn_q_proj_bias5, alloc1605) 
R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias5) model_decoder_layers_31_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] alloc1606: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.NT_matmul(alloc1604, model_decoder_layers_31_self_attn_k_proj_weight5, alloc1606) R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight5) model_decoder_layers_31_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] model_decoder_layers_31_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1235] alloc1607: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1604, model_decoder_layers_31_self_attn_v_proj_weight5, model_decoder_layers_31_self_attn_v_proj_bias5, alloc1607) R.vm.kill_object(alloc1604) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight5) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias5) alloc1608: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1605, alloc1606, alloc1607, alloc1608) R.vm.kill_object(alloc1605) R.vm.kill_object(alloc1606) R.vm.kill_object(alloc1607) alloc1609: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1607: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), alloc1608, alloc1609) R.vm.kill_object(alloc1608) lv385: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1609, R.shape([1, 1, 1280]), 
sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1609) model_decoder_layers_31_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1238] model_decoder_layers_31_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1239] alloc1610: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) R.vm.kill_object(storage22) cls.fused_NT_matmul_add7_add6(lv385, model_decoder_layers_31_self_attn_out_proj_weight5, model_decoder_layers_31_self_attn_out_proj_bias5, alloc1603, alloc1610) R.vm.kill_object(alloc1603) R.vm.kill_object(lv385) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias5) model_decoder_layers_31_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1249] model_decoder_layers_31_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1250] alloc1611: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.layer_norm3(alloc1610, model_decoder_layers_31_encoder_attn_layer_norm_weight5, model_decoder_layers_31_encoder_attn_layer_norm_bias5, alloc1611) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias5) model_decoder_layers_31_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] model_decoder_layers_31_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1246] alloc1612: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) cls.fused_NT_matmul_add7(alloc1611, model_decoder_layers_31_encoder_attn_q_proj_weight5, model_decoder_layers_31_encoder_attn_q_proj_bias5, 
alloc1612) R.vm.kill_object(alloc1611) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight5) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias5) lv388: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1612, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1612) alloc1613: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) _1611: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), lv388, alloc1613) R.vm.kill_object(lv388) lv389: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1613, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) R.vm.kill_object(alloc1613) model_decoder_layers_31_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] model_decoder_layers_31_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1248] alloc1614: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) R.vm.kill_object(storage20) cls.fused_NT_matmul_add7_add6(lv389, model_decoder_layers_31_encoder_attn_out_proj_weight5, model_decoder_layers_31_encoder_attn_out_proj_bias5, alloc1610, alloc1614) R.vm.kill_object(alloc1610) R.vm.kill_object(lv389) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight5) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias5) model_decoder_layers_31_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1255] model_decoder_layers_31_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1256] alloc1615: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) cls.layer_norm3(alloc1614, model_decoder_layers_31_final_layer_norm_weight5, model_decoder_layers_31_final_layer_norm_bias5, alloc1615) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight5) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias5) model_decoder_layers_31_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] model_decoder_layers_31_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1252] alloc1616: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) R.vm.kill_object(storage19) cls.fused_NT_matmul1_add8_gelu2(alloc1615, model_decoder_layers_31_fc1_weight5, model_decoder_layers_31_fc1_bias5, alloc1616) R.vm.kill_object(alloc1615) R.vm.kill_object(model_decoder_layers_31_fc1_weight5) R.vm.kill_object(model_decoder_layers_31_fc1_bias5) model_decoder_layers_31_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] model_decoder_layers_31_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1254] alloc1617: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) R.vm.kill_object(storage21) cls.fused_NT_matmul2_add7_add6(alloc1616, model_decoder_layers_31_fc2_weight5, model_decoder_layers_31_fc2_bias5, alloc1614, alloc1617) R.vm.kill_object(alloc1614) R.vm.kill_object(alloc1616) R.vm.kill_object(model_decoder_layers_31_fc2_weight5) R.vm.kill_object(model_decoder_layers_31_fc2_bias5) model_decoder_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1257] model_decoder_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1258] alloc1618: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) R.vm.kill_object(storage23) cls.layer_norm3(alloc1617, model_decoder_layer_norm_weight5, 
model_decoder_layer_norm_bias5, alloc1618)
        R.vm.kill_object(alloc1617)
        R.vm.kill_object(model_decoder_layer_norm_weight5)
        R.vm.kill_object(model_decoder_layer_norm_bias5)
        # Logits buffer: 207464 bytes = 1 * 1 * 51866 * 4 (float32 vocabulary logits).
        storage: R.Object = R.vm.alloc_storage(R.shape([207464]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
        alloc1619: R.Tensor((1, 1, 51866), dtype="float32") = R.vm.alloc_tensor(storage, R.prim_value(0), R.shape([1, 1, 51866]), R.dtype("float32"))
        R.vm.kill_object(storage)
        # Final vocabulary projection: reuses the decoder token-embedding matrix
        # as the output weight (weight tying), producing (1, 1, 51866) logits.
        cls.NT_matmul3(alloc1618, model_decoder_embed_tokens_weight5, alloc1619)
        R.vm.kill_object(model_decoder_embed_tokens_weight5)
        R.vm.kill_object(alloc1618)
        return alloc1619

    # VM-lowered sampling entry point. Given per-row probability distributions
    # `probs`, pre-drawn uniform random values `uniform_samples`, and a mapping
    # `sample_indices` from each sample to its source row of `probs`, it returns
    # one sampled token id (int32) per sample via the
    # `parallel_sampling_from_prob` kernel. All statements below (shape-heap
    # bookkeeping, check_tensor_info/match_shape guards, explicit
    # alloc/kill_object lifetimes) are compiler-generated memory planning —
    # their exact order is significant and must not be rearranged by hand.
    @R.function
    def multinomial_from_uniform(probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), uniform_samples: R.Tensor(("num_samples",), dtype="float32"), sample_indices: R.Tensor(("num_samples",), dtype="int32")) -> R.Tensor(("num_samples",), dtype="int32"):
        # Symbolic shape variables, bound at runtime via the shape heap below.
        num_samples = T.int64()
        batch_size = T.int64()
        vocab_size = T.int64()
        # Upper bounds let the compiler pre-size buffers (e.g. num_samples <= 8).
        R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}})
        cls = Module
        # Scratch heap with 3 int64 slots holding the runtime values of the
        # symbolic dims (batch_size, vocab_size, num_samples).
        shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),))
        # Validate rank and dtype of each argument against its annotation.
        R.call_packed("vm.builtin.check_tensor_info", probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
        R.call_packed("vm.builtin.check_tensor_info", uniform_samples, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[1], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
        R.call_packed("vm.builtin.check_tensor_info", sample_indices, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
        # Bind the symbolic dims into the shape heap (and check consistency).
        R.call_packed("vm.builtin.match_shape", probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
        R.call_packed("vm.builtin.match_shape", uniform_samples, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[1], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
        R.call_packed("vm.builtin.match_shape", sample_indices, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
        # Reshape uniform_samples to a (num_samples, 1) column vector.
        gv6: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
        uniform_samples_1: R.Tensor((num_samples, 1), dtype="float32") = R.call_packed("vm.builtin.reshape", uniform_samples, gv6, sinfo_args=(R.Tensor((num_samples, 1), dtype="float32"),))
        # Reshape sample_indices to a (num_samples, 1) column vector.
        gv7: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
        sample_indices_1: R.Tensor((num_samples, 1), dtype="int32") = R.call_packed("vm.builtin.reshape", sample_indices, gv7, sinfo_args=(R.Tensor((num_samples, 1), dtype="int32"),))
        # Output storage: 32 bytes = 8 (num_samples upper bound) * 4 (int32).
        storage3: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
        gv8: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
        alloc3: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage3, R.prim_value(0), gv8, R.dtype("int32"))
        R.vm.kill_object(storage3)
        # Sampling kernel: writes one sampled token id per row into alloc3.
        cls.parallel_sampling_from_prob(probs, uniform_samples_1, sample_indices_1, alloc3)
        R.vm.kill_object(uniform_samples_1)
        R.vm.kill_object(sample_indices_1)
        # Flatten the (num_samples, 1) result back to (num_samples,).
        gv9: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),))
        gv: R.Tensor((num_samples,), dtype="int32") = R.call_packed("vm.builtin.reshape", alloc3, gv9, sinfo_args=(R.Tensor((num_samples,), dtype="int32"),))
        R.vm.kill_object(alloc3)
        return gv

    # Prefill entry point; packed_params enumerates every model weight tensor
    # in order (signature continues below this span).
    @R.function
    def prefill(input_ids: R.Tensor((1, "seq_len"), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280,
5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor((1, 1, 51866), dtype="float32"): seq_len = T.int64() R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(2),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=prefill, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 
5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 
1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), 
R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.str("ErrorContext(fn=prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) model_decoder_embed_tokens_weight4: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] gv2580: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) reshape1030: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, gv2580, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) model_decoder_embed_tokens_weight4_1: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] storage37: R.Object = R.vm.alloc_storage(R.shape([153600000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2581: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc1982: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2581, R.dtype("float16")) cls.take(model_decoder_embed_tokens_weight4_1, reshape1030, alloc1982) R.vm.kill_object(reshape1030) R.vm.kill_object(model_decoder_embed_tokens_weight4_1) gv2582: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1031: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1982, gv2582, 
sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(alloc1982) lv198: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) model_decoder_embed_positions_weight4: R.Tensor((448, 1280), dtype="float16") = packed_params[488] storage38: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2583: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) alloc1983: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2583, R.dtype("float16")) cls.take1(model_decoder_embed_positions_weight4, lv198, alloc1983) R.vm.kill_object(lv198) R.vm.kill_object(model_decoder_embed_positions_weight4) gv2584: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1032: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1983, gv2584, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(alloc1983) storage39: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2585: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1984: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2585, R.dtype("float16")) cls.add5(reshape1031, reshape1032, alloc1984) R.vm.kill_object(reshape1031) R.vm.kill_object(reshape1032) 
model_decoder_layers_0_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[496] model_decoder_layers_0_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[497] gv2586: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1985: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2586, R.dtype("float16")) cls.layer_norm2(alloc1984, model_decoder_layers_0_self_attn_layer_norm_weight4, model_decoder_layers_0_self_attn_layer_norm_bias4, alloc1985) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias4) model_decoder_layers_0_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] model_decoder_layers_0_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[493] gv2587: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1986: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2587, R.dtype("float16")) _1985: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_q_proj_weight4, alloc1985, model_decoder_layers_0_self_attn_q_proj_bias4, alloc1986) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias4) gv2588: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape1033: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1986, gv2588, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1986) model_decoder_layers_0_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] storage40: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2589: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1987: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2589, R.dtype("float16")) _1986: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_0_self_attn_k_proj_weight4, alloc1985, alloc1987) R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight4) gv2590: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1034: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1987, gv2590, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1987) model_decoder_layers_0_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] model_decoder_layers_0_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[491] storage41: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2591: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1988: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2591, R.dtype("float16")) _1987: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_v_proj_weight4, alloc1985, model_decoder_layers_0_self_attn_v_proj_bias4, alloc1988) R.vm.kill_object(alloc1985) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias4) gv2592: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1035: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1988, gv2592, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1988) gv2593: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc1989: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2593, R.dtype("float16")) cls.concatenate1(reshape1033, reshape1034, reshape1035, alloc1989) R.vm.kill_object(reshape1033) R.vm.kill_object(reshape1034) R.vm.kill_object(reshape1035) gv2594: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1036: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1989, gv2594, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) 
R.vm.kill_object(alloc1989) gv2595: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1990: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2595, R.dtype("float16")) _1989: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape1036, alloc1990) R.vm.kill_object(reshape1036) gv2596: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1037: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1990, gv2596, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1990) gv2597: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1038: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1037, gv2597, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1037) model_decoder_layers_0_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] model_decoder_layers_0_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[495] gv2598: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1991: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage40, R.prim_value(0), gv2598, R.dtype("float16")) _1990: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_out_proj_weight4, reshape1038, model_decoder_layers_0_self_attn_out_proj_bias4, alloc1991) R.vm.kill_object(reshape1038) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias4) gv2599: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1992: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2599, R.dtype("float16")) cls.add5(alloc1984, alloc1991, alloc1992) R.vm.kill_object(alloc1984) R.vm.kill_object(alloc1991) model_decoder_layers_0_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[505] model_decoder_layers_0_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[506] gv2600: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1993: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2600, R.dtype("float16")) cls.layer_norm2(alloc1992, model_decoder_layers_0_encoder_attn_layer_norm_weight4, model_decoder_layers_0_encoder_attn_layer_norm_bias4, alloc1993) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias4) model_decoder_layers_0_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[501] model_decoder_layers_0_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[502] gv2601: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1994: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2601, R.dtype("float16")) _1993: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_q_proj_weight4, alloc1993, model_decoder_layers_0_encoder_attn_q_proj_bias4, alloc1994) R.vm.kill_object(alloc1993) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias4) gv2602: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1039: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1994, gv2602, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1994) gv2603: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1040: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1039, gv2603, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1039) gv2604: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc1995: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2604, R.dtype("float16")) _1994: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape1040, alloc1995) R.vm.kill_object(reshape1040) gv2605: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1041: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1995, gv2605, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc1995) gv2606: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1042: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1041, gv2606, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1041) model_decoder_layers_0_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[503] model_decoder_layers_0_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[504] gv2607: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1996: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2607, R.dtype("float16")) _1995: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_out_proj_weight4, reshape1042, model_decoder_layers_0_encoder_attn_out_proj_bias4, alloc1996) R.vm.kill_object(reshape1042) R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight4) 
R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias4) gv2608: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1997: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2608, R.dtype("float16")) cls.add5(alloc1992, alloc1996, alloc1997) R.vm.kill_object(alloc1992) R.vm.kill_object(alloc1996) model_decoder_layers_0_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[511] model_decoder_layers_0_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[512] gv2609: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc1998: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2609, R.dtype("float16")) cls.layer_norm2(alloc1997, model_decoder_layers_0_final_layer_norm_weight4, model_decoder_layers_0_final_layer_norm_bias4, alloc1998) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias4) model_decoder_layers_0_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] model_decoder_layers_0_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[508] gv2610: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc1999: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2610, R.dtype("float16")) _1998: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", 
model_decoder_layers_0_fc1_weight4, alloc1998, model_decoder_layers_0_fc1_bias4, alloc1999) R.vm.kill_object(alloc1998) R.vm.kill_object(model_decoder_layers_0_fc1_weight4) R.vm.kill_object(model_decoder_layers_0_fc1_bias4) model_decoder_layers_0_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] model_decoder_layers_0_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[510] gv2611: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2000: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2611, R.dtype("float16")) _1999: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_0_fc2_weight4, alloc1999, model_decoder_layers_0_fc2_bias4, alloc2000) R.vm.kill_object(alloc1999) R.vm.kill_object(model_decoder_layers_0_fc2_weight4) R.vm.kill_object(model_decoder_layers_0_fc2_bias4) gv2612: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2001: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2612, R.dtype("float16")) cls.add5(alloc1997, alloc2000, alloc2001) R.vm.kill_object(alloc1997) R.vm.kill_object(alloc2000) model_decoder_layers_1_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[520] model_decoder_layers_1_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[521] gv2613: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2002: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2613, R.dtype("float16")) cls.layer_norm2(alloc2001, model_decoder_layers_1_self_attn_layer_norm_weight4, model_decoder_layers_1_self_attn_layer_norm_bias4, alloc2002) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias4) model_decoder_layers_1_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] model_decoder_layers_1_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[517] gv2614: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2003: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2614, R.dtype("float16")) _2002: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_q_proj_weight4, alloc2002, model_decoder_layers_1_self_attn_q_proj_bias4, alloc2003) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias4) gv2615: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1043: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2003, gv2615, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2003) model_decoder_layers_1_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] gv2616: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2004: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2616, R.dtype("float16")) _2003: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_1_self_attn_k_proj_weight4, alloc2002, alloc2004) R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight4) gv2617: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1044: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2004, gv2617, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2004) model_decoder_layers_1_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[514] model_decoder_layers_1_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[515] gv2618: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2005: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2618, R.dtype("float16")) _2004: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_v_proj_weight4, alloc2002, model_decoder_layers_1_self_attn_v_proj_bias4, alloc2005) R.vm.kill_object(alloc2002) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias4) gv2619: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1045: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2005, gv2619, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2005) gv2620: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2006: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2620, R.dtype("float16")) cls.concatenate1(reshape1043, reshape1044, reshape1045, alloc2006) R.vm.kill_object(reshape1043) R.vm.kill_object(reshape1044) R.vm.kill_object(reshape1045) gv2621: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1046: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2006, gv2621, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2006) gv2622: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2007: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2622, R.dtype("float16")) _2006: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape1046, alloc2007) R.vm.kill_object(reshape1046) gv2623: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1047: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2007, gv2623, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2007) gv2624: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1048: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1047, gv2624, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1047) model_decoder_layers_1_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] model_decoder_layers_1_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[519] gv2625: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2008: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2625, R.dtype("float16")) _2007: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_out_proj_weight4, reshape1048, model_decoder_layers_1_self_attn_out_proj_bias4, alloc2008) R.vm.kill_object(reshape1048) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias4) gv2626: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2009: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2626, R.dtype("float16")) cls.add5(alloc2001, alloc2008, alloc2009) R.vm.kill_object(alloc2001) R.vm.kill_object(alloc2008) model_decoder_layers_1_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[529] model_decoder_layers_1_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[530] gv2627: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2010: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2627, R.dtype("float16")) cls.layer_norm2(alloc2009, model_decoder_layers_1_encoder_attn_layer_norm_weight4, model_decoder_layers_1_encoder_attn_layer_norm_bias4, alloc2010) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias4) model_decoder_layers_1_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[525] model_decoder_layers_1_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[526] gv2628: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2011: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2628, R.dtype("float16")) _2010: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_q_proj_weight4, alloc2010, model_decoder_layers_1_encoder_attn_q_proj_bias4, alloc2011) R.vm.kill_object(alloc2010) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias4) 
gv2629: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1049: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2011, gv2629, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2011) gv2630: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1050: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1049, gv2630, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1049) gv2631: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2012: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2631, R.dtype("float16")) _2011: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape1050, alloc2012) R.vm.kill_object(reshape1050) gv2632: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1051: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2012, gv2632, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2012) gv2633: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1052: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1051, gv2633, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1051) model_decoder_layers_1_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] model_decoder_layers_1_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[528] gv2634: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2013: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2634, R.dtype("float16")) _2012: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_out_proj_weight4, reshape1052, model_decoder_layers_1_encoder_attn_out_proj_bias4, alloc2013) R.vm.kill_object(reshape1052) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias4) gv2635: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2014: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2635, R.dtype("float16")) cls.add5(alloc2009, alloc2013, alloc2014) R.vm.kill_object(alloc2009) R.vm.kill_object(alloc2013) model_decoder_layers_1_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[535] model_decoder_layers_1_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[536] 
gv2636: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2015: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2636, R.dtype("float16")) cls.layer_norm2(alloc2014, model_decoder_layers_1_final_layer_norm_weight4, model_decoder_layers_1_final_layer_norm_bias4, alloc2015) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias4) model_decoder_layers_1_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] model_decoder_layers_1_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[532] gv2637: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2016: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2637, R.dtype("float16")) _2015: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_1_fc1_weight4, alloc2015, model_decoder_layers_1_fc1_bias4, alloc2016) R.vm.kill_object(alloc2015) R.vm.kill_object(model_decoder_layers_1_fc1_weight4) R.vm.kill_object(model_decoder_layers_1_fc1_bias4) model_decoder_layers_1_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] model_decoder_layers_1_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[534] gv2638: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2017: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2638, 
R.dtype("float16")) _2016: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_1_fc2_weight4, alloc2016, model_decoder_layers_1_fc2_bias4, alloc2017) R.vm.kill_object(alloc2016) R.vm.kill_object(model_decoder_layers_1_fc2_weight4) R.vm.kill_object(model_decoder_layers_1_fc2_bias4) gv2639: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2018: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2639, R.dtype("float16")) cls.add5(alloc2014, alloc2017, alloc2018) R.vm.kill_object(alloc2014) R.vm.kill_object(alloc2017) model_decoder_layers_2_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[544] model_decoder_layers_2_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[545] gv2640: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2019: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2640, R.dtype("float16")) cls.layer_norm2(alloc2018, model_decoder_layers_2_self_attn_layer_norm_weight4, model_decoder_layers_2_self_attn_layer_norm_bias4, alloc2019) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias4) model_decoder_layers_2_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] model_decoder_layers_2_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[541] gv2641: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2020: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2641, R.dtype("float16")) _2019: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_q_proj_weight4, alloc2019, model_decoder_layers_2_self_attn_q_proj_bias4, alloc2020) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias4) gv2642: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1053: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2020, gv2642, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2020) model_decoder_layers_2_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] gv2643: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2021: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2643, R.dtype("float16")) _2020: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_2_self_attn_k_proj_weight4, alloc2019, alloc2021) R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight4) gv2644: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1054: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2021, gv2644, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2021) model_decoder_layers_2_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[538] model_decoder_layers_2_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[539] gv2645: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2022: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2645, R.dtype("float16")) _2021: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_v_proj_weight4, alloc2019, model_decoder_layers_2_self_attn_v_proj_bias4, alloc2022) R.vm.kill_object(alloc2019) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias4) gv2646: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1055: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2022, gv2646, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2022) gv2647: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2023: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2647, R.dtype("float16")) cls.concatenate1(reshape1053, reshape1054, 
reshape1055, alloc2023) R.vm.kill_object(reshape1053) R.vm.kill_object(reshape1054) R.vm.kill_object(reshape1055) gv2648: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1056: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2023, gv2648, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2023) gv2649: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2024: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2649, R.dtype("float16")) _2023: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape1056, alloc2024) R.vm.kill_object(reshape1056) gv2650: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1057: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2024, gv2650, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2024) gv2651: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1058: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1057, gv2651, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) 
R.vm.kill_object(reshape1057) model_decoder_layers_2_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] model_decoder_layers_2_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[543] gv2652: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2025: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2652, R.dtype("float16")) _2024: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_out_proj_weight4, reshape1058, model_decoder_layers_2_self_attn_out_proj_bias4, alloc2025) R.vm.kill_object(reshape1058) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias4) gv2653: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2026: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2653, R.dtype("float16")) cls.add5(alloc2018, alloc2025, alloc2026) R.vm.kill_object(alloc2018) R.vm.kill_object(alloc2025) model_decoder_layers_2_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[553] model_decoder_layers_2_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[554] gv2654: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2027: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2654, R.dtype("float16")) 
cls.layer_norm2(alloc2026, model_decoder_layers_2_encoder_attn_layer_norm_weight4, model_decoder_layers_2_encoder_attn_layer_norm_bias4, alloc2027) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias4) model_decoder_layers_2_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] model_decoder_layers_2_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[550] gv2655: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2028: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2655, R.dtype("float16")) _2027: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_q_proj_weight4, alloc2027, model_decoder_layers_2_encoder_attn_q_proj_bias4, alloc2028) R.vm.kill_object(alloc2027) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias4) gv2656: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1059: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2028, gv2656, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2028) gv2657: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1060: R.Tensor((seq_len, 20, 64), dtype="float16") 
= R.call_packed("vm.builtin.reshape", reshape1059, gv2657, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1059) gv2658: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2029: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2658, R.dtype("float16")) _2028: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape1060, alloc2029) R.vm.kill_object(reshape1060) gv2659: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1061: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2029, gv2659, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2029) gv2660: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1062: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1061, gv2660, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1061) model_decoder_layers_2_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] model_decoder_layers_2_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[552] gv2661: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2030: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2661, R.dtype("float16")) _2029: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_out_proj_weight4, reshape1062, model_decoder_layers_2_encoder_attn_out_proj_bias4, alloc2030) R.vm.kill_object(reshape1062) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias4) gv2662: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2031: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2662, R.dtype("float16")) cls.add5(alloc2026, alloc2030, alloc2031) R.vm.kill_object(alloc2026) R.vm.kill_object(alloc2030) model_decoder_layers_2_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[559] model_decoder_layers_2_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[560] gv2663: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2032: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2663, R.dtype("float16")) cls.layer_norm2(alloc2031, model_decoder_layers_2_final_layer_norm_weight4, model_decoder_layers_2_final_layer_norm_bias4, alloc2032) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias4) model_decoder_layers_2_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] model_decoder_layers_2_fc1_bias4: 
R.Tensor((5120,), dtype="float16") = packed_params[556] gv2664: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2033: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2664, R.dtype("float16")) _2032: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_2_fc1_weight4, alloc2032, model_decoder_layers_2_fc1_bias4, alloc2033) R.vm.kill_object(alloc2032) R.vm.kill_object(model_decoder_layers_2_fc1_weight4) R.vm.kill_object(model_decoder_layers_2_fc1_bias4) model_decoder_layers_2_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] model_decoder_layers_2_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[558] gv2665: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2034: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2665, R.dtype("float16")) _2033: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_2_fc2_weight4, alloc2033, model_decoder_layers_2_fc2_bias4, alloc2034) R.vm.kill_object(alloc2033) R.vm.kill_object(model_decoder_layers_2_fc2_weight4) R.vm.kill_object(model_decoder_layers_2_fc2_bias4) gv2666: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2035: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2666, R.dtype("float16")) cls.add5(alloc2031, alloc2034, alloc2035) 
R.vm.kill_object(alloc2031) R.vm.kill_object(alloc2034) model_decoder_layers_3_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[568] model_decoder_layers_3_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[569] gv2667: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2036: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2667, R.dtype("float16")) cls.layer_norm2(alloc2035, model_decoder_layers_3_self_attn_layer_norm_weight4, model_decoder_layers_3_self_attn_layer_norm_bias4, alloc2036) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias4) model_decoder_layers_3_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] model_decoder_layers_3_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[565] gv2668: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2037: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2668, R.dtype("float16")) _2036: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_q_proj_weight4, alloc2036, model_decoder_layers_3_self_attn_q_proj_bias4, alloc2037) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias4) gv2669: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1063: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2037, gv2669, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2037) model_decoder_layers_3_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] gv2670: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2038: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2670, R.dtype("float16")) _2037: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_3_self_attn_k_proj_weight4, alloc2036, alloc2038) R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight4) gv2671: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1064: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2038, gv2671, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2038) model_decoder_layers_3_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[562] model_decoder_layers_3_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[563] gv2672: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2039: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2672, 
R.dtype("float16")) _2038: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_v_proj_weight4, alloc2036, model_decoder_layers_3_self_attn_v_proj_bias4, alloc2039) R.vm.kill_object(alloc2036) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias4) gv2673: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1065: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2039, gv2673, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2039) gv2674: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2040: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2674, R.dtype("float16")) cls.concatenate1(reshape1063, reshape1064, reshape1065, alloc2040) R.vm.kill_object(reshape1063) R.vm.kill_object(reshape1064) R.vm.kill_object(reshape1065) gv2675: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1066: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2040, gv2675, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2040) gv2676: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2041: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2676, R.dtype("float16")) _2040: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape1066, alloc2041) R.vm.kill_object(reshape1066) gv2677: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1067: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2041, gv2677, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2041) gv2678: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1068: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1067, gv2678, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1067) model_decoder_layers_3_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] model_decoder_layers_3_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[567] gv2679: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2042: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2679, R.dtype("float16")) _2041: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_3_self_attn_out_proj_weight4, reshape1068, model_decoder_layers_3_self_attn_out_proj_bias4, alloc2042) R.vm.kill_object(reshape1068) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias4) gv2680: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2043: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2680, R.dtype("float16")) cls.add5(alloc2035, alloc2042, alloc2043) R.vm.kill_object(alloc2035) R.vm.kill_object(alloc2042) model_decoder_layers_3_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[577] model_decoder_layers_3_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[578] gv2681: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2044: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2681, R.dtype("float16")) cls.layer_norm2(alloc2043, model_decoder_layers_3_encoder_attn_layer_norm_weight4, model_decoder_layers_3_encoder_attn_layer_norm_bias4, alloc2044) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias4) model_decoder_layers_3_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] model_decoder_layers_3_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[574] gv2682: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2045: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2682, R.dtype("float16")) _2044: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_q_proj_weight4, alloc2044, model_decoder_layers_3_encoder_attn_q_proj_bias4, alloc2045) R.vm.kill_object(alloc2044) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias4) gv2683: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1069: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2045, gv2683, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2045) gv2684: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1070: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1069, gv2684, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1069) gv2685: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2046: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2685, R.dtype("float16")) _2045: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape1070, alloc2046) 
R.vm.kill_object(reshape1070) gv2686: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1071: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2046, gv2686, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2046) gv2687: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1072: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1071, gv2687, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1071) model_decoder_layers_3_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] model_decoder_layers_3_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[576] gv2688: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2047: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2688, R.dtype("float16")) _2046: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_out_proj_weight4, reshape1072, model_decoder_layers_3_encoder_attn_out_proj_bias4, alloc2047) R.vm.kill_object(reshape1072) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias4) gv2689: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2048: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2689, R.dtype("float16")) cls.add5(alloc2043, alloc2047, alloc2048) R.vm.kill_object(alloc2043) R.vm.kill_object(alloc2047) model_decoder_layers_3_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[583] model_decoder_layers_3_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[584] gv2690: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2049: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2690, R.dtype("float16")) cls.layer_norm2(alloc2048, model_decoder_layers_3_final_layer_norm_weight4, model_decoder_layers_3_final_layer_norm_bias4, alloc2049) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias4) model_decoder_layers_3_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] model_decoder_layers_3_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[580] gv2691: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2050: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2691, R.dtype("float16")) _2049: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_3_fc1_weight4, alloc2049, model_decoder_layers_3_fc1_bias4, alloc2050) R.vm.kill_object(alloc2049) R.vm.kill_object(model_decoder_layers_3_fc1_weight4) 
R.vm.kill_object(model_decoder_layers_3_fc1_bias4) model_decoder_layers_3_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] model_decoder_layers_3_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[582] gv2692: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2051: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2692, R.dtype("float16")) _2050: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_3_fc2_weight4, alloc2050, model_decoder_layers_3_fc2_bias4, alloc2051) R.vm.kill_object(alloc2050) R.vm.kill_object(model_decoder_layers_3_fc2_weight4) R.vm.kill_object(model_decoder_layers_3_fc2_bias4) gv2693: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2052: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2693, R.dtype("float16")) cls.add5(alloc2048, alloc2051, alloc2052) R.vm.kill_object(alloc2048) R.vm.kill_object(alloc2051) model_decoder_layers_4_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[592] model_decoder_layers_4_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[593] gv2694: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2053: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2694, R.dtype("float16")) cls.layer_norm2(alloc2052, model_decoder_layers_4_self_attn_layer_norm_weight4, 
model_decoder_layers_4_self_attn_layer_norm_bias4, alloc2053) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias4) model_decoder_layers_4_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] model_decoder_layers_4_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[589] gv2695: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2054: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2695, R.dtype("float16")) _2053: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_q_proj_weight4, alloc2053, model_decoder_layers_4_self_attn_q_proj_bias4, alloc2054) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias4) gv2696: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1073: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2054, gv2696, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2054) model_decoder_layers_4_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] gv2697: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2055: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, 
R.prim_value(0), gv2697, R.dtype("float16")) _2054: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_4_self_attn_k_proj_weight4, alloc2053, alloc2055) R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight4) gv2698: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1074: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2055, gv2698, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2055) model_decoder_layers_4_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] model_decoder_layers_4_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[587] gv2699: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2056: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2699, R.dtype("float16")) _2055: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_v_proj_weight4, alloc2053, model_decoder_layers_4_self_attn_v_proj_bias4, alloc2056) R.vm.kill_object(alloc2053) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias4) gv2700: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1075: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc2056, gv2700, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2056) gv2701: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2057: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2701, R.dtype("float16")) cls.concatenate1(reshape1073, reshape1074, reshape1075, alloc2057) R.vm.kill_object(reshape1073) R.vm.kill_object(reshape1074) R.vm.kill_object(reshape1075) gv2702: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1076: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2057, gv2702, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2057) gv2703: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2058: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2703, R.dtype("float16")) _2057: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape1076, alloc2058) R.vm.kill_object(reshape1076) gv2704: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1077: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc2058, gv2704, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2058) gv2705: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1078: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1077, gv2705, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1077) model_decoder_layers_4_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] model_decoder_layers_4_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[591] gv2706: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2059: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2706, R.dtype("float16")) _2058: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_out_proj_weight4, reshape1078, model_decoder_layers_4_self_attn_out_proj_bias4, alloc2059) R.vm.kill_object(reshape1078) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias4) gv2707: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2060: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2707, R.dtype("float16")) cls.add5(alloc2052, alloc2059, alloc2060) R.vm.kill_object(alloc2052) R.vm.kill_object(alloc2059) 
model_decoder_layers_4_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[601] model_decoder_layers_4_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[602] gv2708: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2061: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2708, R.dtype("float16")) cls.layer_norm2(alloc2060, model_decoder_layers_4_encoder_attn_layer_norm_weight4, model_decoder_layers_4_encoder_attn_layer_norm_bias4, alloc2061) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias4) model_decoder_layers_4_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] model_decoder_layers_4_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[598] gv2709: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2062: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2709, R.dtype("float16")) _2061: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_q_proj_weight4, alloc2061, model_decoder_layers_4_encoder_attn_q_proj_bias4, alloc2062) R.vm.kill_object(alloc2061) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias4) gv2710: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1079: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2062, gv2710, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2062) gv2711: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1080: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1079, gv2711, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1079) gv2712: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2063: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2712, R.dtype("float16")) _2062: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape1080, alloc2063) R.vm.kill_object(reshape1080) gv2713: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1081: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2063, gv2713, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2063) gv2714: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1082: 
R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1081, gv2714, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1081) model_decoder_layers_4_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] model_decoder_layers_4_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[600] gv2715: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2064: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2715, R.dtype("float16")) _2063: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_out_proj_weight4, reshape1082, model_decoder_layers_4_encoder_attn_out_proj_bias4, alloc2064) R.vm.kill_object(reshape1082) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias4) gv2716: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2065: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2716, R.dtype("float16")) cls.add5(alloc2060, alloc2064, alloc2065) R.vm.kill_object(alloc2060) R.vm.kill_object(alloc2064) model_decoder_layers_4_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[607] model_decoder_layers_4_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[608] gv2717: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2066: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2717, R.dtype("float16")) cls.layer_norm2(alloc2065, model_decoder_layers_4_final_layer_norm_weight4, model_decoder_layers_4_final_layer_norm_bias4, alloc2066) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias4) model_decoder_layers_4_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] model_decoder_layers_4_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[604] gv2718: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2067: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2718, R.dtype("float16")) _2066: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_4_fc1_weight4, alloc2066, model_decoder_layers_4_fc1_bias4, alloc2067) R.vm.kill_object(alloc2066) R.vm.kill_object(model_decoder_layers_4_fc1_weight4) R.vm.kill_object(model_decoder_layers_4_fc1_bias4) model_decoder_layers_4_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] model_decoder_layers_4_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[606] gv2719: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2068: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2719, R.dtype("float16")) _2067: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_4_fc2_weight4, alloc2067, 
model_decoder_layers_4_fc2_bias4, alloc2068) R.vm.kill_object(alloc2067) R.vm.kill_object(model_decoder_layers_4_fc2_weight4) R.vm.kill_object(model_decoder_layers_4_fc2_bias4) gv2720: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2069: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2720, R.dtype("float16")) cls.add5(alloc2065, alloc2068, alloc2069) R.vm.kill_object(alloc2065) R.vm.kill_object(alloc2068) model_decoder_layers_5_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[616] model_decoder_layers_5_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[617] gv2721: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2070: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2721, R.dtype("float16")) cls.layer_norm2(alloc2069, model_decoder_layers_5_self_attn_layer_norm_weight4, model_decoder_layers_5_self_attn_layer_norm_bias4, alloc2070) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias4) model_decoder_layers_5_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] model_decoder_layers_5_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[613] gv2722: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2071: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, 
R.prim_value(0), gv2722, R.dtype("float16")) _2070: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_q_proj_weight4, alloc2070, model_decoder_layers_5_self_attn_q_proj_bias4, alloc2071) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias4) gv2723: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1083: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2071, gv2723, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2071) model_decoder_layers_5_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] gv2724: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2072: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2724, R.dtype("float16")) _2071: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_5_self_attn_k_proj_weight4, alloc2070, alloc2072) R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight4) gv2725: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1084: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2072, gv2725, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc2072) model_decoder_layers_5_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[610] model_decoder_layers_5_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[611] gv2726: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2073: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2726, R.dtype("float16")) _2072: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_v_proj_weight4, alloc2070, model_decoder_layers_5_self_attn_v_proj_bias4, alloc2073) R.vm.kill_object(alloc2070) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias4) gv2727: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1085: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2073, gv2727, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2073) gv2728: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2074: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2728, R.dtype("float16")) cls.concatenate1(reshape1083, reshape1084, reshape1085, alloc2074) R.vm.kill_object(reshape1083) R.vm.kill_object(reshape1084) R.vm.kill_object(reshape1085) gv2729: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1086: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2074, gv2729, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2074) gv2730: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2075: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2730, R.dtype("float16")) _2074: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape1086, alloc2075) R.vm.kill_object(reshape1086) gv2731: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1087: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2075, gv2731, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2075) gv2732: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1088: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1087, gv2732, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1087) model_decoder_layers_5_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] 
model_decoder_layers_5_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[615] gv2733: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2076: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2733, R.dtype("float16")) _2075: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_out_proj_weight4, reshape1088, model_decoder_layers_5_self_attn_out_proj_bias4, alloc2076) R.vm.kill_object(reshape1088) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias4) gv2734: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2077: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2734, R.dtype("float16")) cls.add5(alloc2069, alloc2076, alloc2077) R.vm.kill_object(alloc2069) R.vm.kill_object(alloc2076) model_decoder_layers_5_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[625] model_decoder_layers_5_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[626] gv2735: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2078: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2735, R.dtype("float16")) cls.layer_norm2(alloc2077, model_decoder_layers_5_encoder_attn_layer_norm_weight4, model_decoder_layers_5_encoder_attn_layer_norm_bias4, 
alloc2078) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias4) model_decoder_layers_5_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] model_decoder_layers_5_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[622] gv2736: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2079: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2736, R.dtype("float16")) _2078: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_q_proj_weight4, alloc2078, model_decoder_layers_5_encoder_attn_q_proj_bias4, alloc2079) R.vm.kill_object(alloc2078) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias4) gv2737: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1089: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2079, gv2737, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2079) gv2738: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1090: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1089, gv2738, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) 
R.vm.kill_object(reshape1089) gv2739: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2080: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2739, R.dtype("float16")) _2079: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape1090, alloc2080) R.vm.kill_object(reshape1090) gv2740: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1091: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2080, gv2740, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2080) gv2741: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1092: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1091, gv2741, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1091) model_decoder_layers_5_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] model_decoder_layers_5_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[624] gv2742: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2081: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage40, R.prim_value(0), gv2742, R.dtype("float16")) _2080: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_out_proj_weight4, reshape1092, model_decoder_layers_5_encoder_attn_out_proj_bias4, alloc2081) R.vm.kill_object(reshape1092) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias4) gv2743: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2082: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2743, R.dtype("float16")) cls.add5(alloc2077, alloc2081, alloc2082) R.vm.kill_object(alloc2077) R.vm.kill_object(alloc2081) model_decoder_layers_5_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[631] model_decoder_layers_5_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[632] gv2744: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2083: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2744, R.dtype("float16")) cls.layer_norm2(alloc2082, model_decoder_layers_5_final_layer_norm_weight4, model_decoder_layers_5_final_layer_norm_bias4, alloc2083) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias4) model_decoder_layers_5_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] model_decoder_layers_5_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[628] gv2745: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2084: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2745, R.dtype("float16")) _2083: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_5_fc1_weight4, alloc2083, model_decoder_layers_5_fc1_bias4, alloc2084) R.vm.kill_object(alloc2083) R.vm.kill_object(model_decoder_layers_5_fc1_weight4) R.vm.kill_object(model_decoder_layers_5_fc1_bias4) model_decoder_layers_5_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] model_decoder_layers_5_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[630] gv2746: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2085: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2746, R.dtype("float16")) _2084: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_5_fc2_weight4, alloc2084, model_decoder_layers_5_fc2_bias4, alloc2085) R.vm.kill_object(alloc2084) R.vm.kill_object(model_decoder_layers_5_fc2_weight4) R.vm.kill_object(model_decoder_layers_5_fc2_bias4) gv2747: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2086: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2747, R.dtype("float16")) cls.add5(alloc2082, alloc2085, alloc2086) R.vm.kill_object(alloc2082) R.vm.kill_object(alloc2085) model_decoder_layers_6_self_attn_layer_norm_weight4: R.Tensor((1280,), 
dtype="float16") = packed_params[640] model_decoder_layers_6_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[641] gv2748: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2087: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2748, R.dtype("float16")) cls.layer_norm2(alloc2086, model_decoder_layers_6_self_attn_layer_norm_weight4, model_decoder_layers_6_self_attn_layer_norm_bias4, alloc2087) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias4) model_decoder_layers_6_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] model_decoder_layers_6_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[637] gv2749: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2088: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2749, R.dtype("float16")) _2087: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_q_proj_weight4, alloc2087, model_decoder_layers_6_self_attn_q_proj_bias4, alloc2088) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias4) gv2750: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1093: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2088, gv2750, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2088) model_decoder_layers_6_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] gv2751: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2089: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2751, R.dtype("float16")) _2088: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_6_self_attn_k_proj_weight4, alloc2087, alloc2089) R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight4) gv2752: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1094: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2089, gv2752, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2089) model_decoder_layers_6_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] model_decoder_layers_6_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[635] gv2753: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2090: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2753, R.dtype("float16")) _2089: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_6_self_attn_v_proj_weight4, alloc2087, model_decoder_layers_6_self_attn_v_proj_bias4, alloc2090) R.vm.kill_object(alloc2087) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias4) gv2754: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1095: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2090, gv2754, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2090) gv2755: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2091: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2755, R.dtype("float16")) cls.concatenate1(reshape1093, reshape1094, reshape1095, alloc2091) R.vm.kill_object(reshape1093) R.vm.kill_object(reshape1094) R.vm.kill_object(reshape1095) gv2756: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1096: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2091, gv2756, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2091) gv2757: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2092: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage39, R.prim_value(0), gv2757, R.dtype("float16")) _2091: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape1096, alloc2092) R.vm.kill_object(reshape1096) gv2758: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1097: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2092, gv2758, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2092) gv2759: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1098: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1097, gv2759, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1097) model_decoder_layers_6_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[638] model_decoder_layers_6_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[639] gv2760: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2093: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2760, R.dtype("float16")) _2092: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_out_proj_weight4, reshape1098, model_decoder_layers_6_self_attn_out_proj_bias4, alloc2093) 
R.vm.kill_object(reshape1098) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias4) gv2761: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2094: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2761, R.dtype("float16")) cls.add5(alloc2086, alloc2093, alloc2094) R.vm.kill_object(alloc2086) R.vm.kill_object(alloc2093) model_decoder_layers_6_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[649] model_decoder_layers_6_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[650] gv2762: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2095: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2762, R.dtype("float16")) cls.layer_norm2(alloc2094, model_decoder_layers_6_encoder_attn_layer_norm_weight4, model_decoder_layers_6_encoder_attn_layer_norm_bias4, alloc2095) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias4) model_decoder_layers_6_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] model_decoder_layers_6_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[646] gv2763: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2096: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage40, R.prim_value(0), gv2763, R.dtype("float16")) _2095: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_q_proj_weight4, alloc2095, model_decoder_layers_6_encoder_attn_q_proj_bias4, alloc2096) R.vm.kill_object(alloc2095) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias4) gv2764: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1099: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2096, gv2764, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2096) gv2765: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1100: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1099, gv2765, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1099) gv2766: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2097: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2766, R.dtype("float16")) _2096: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape1100, alloc2097) R.vm.kill_object(reshape1100) gv2767: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1101: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2097, gv2767, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2097) gv2768: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1102: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1101, gv2768, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1101) model_decoder_layers_6_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] model_decoder_layers_6_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[648] gv2769: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2098: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2769, R.dtype("float16")) _2097: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_out_proj_weight4, reshape1102, model_decoder_layers_6_encoder_attn_out_proj_bias4, alloc2098) R.vm.kill_object(reshape1102) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias4) gv2770: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2099: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2770, R.dtype("float16")) cls.add5(alloc2094, alloc2098, alloc2099) R.vm.kill_object(alloc2094) R.vm.kill_object(alloc2098) model_decoder_layers_6_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[655] model_decoder_layers_6_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[656] gv2771: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2100: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2771, R.dtype("float16")) cls.layer_norm2(alloc2099, model_decoder_layers_6_final_layer_norm_weight4, model_decoder_layers_6_final_layer_norm_bias4, alloc2100) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias4) model_decoder_layers_6_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] model_decoder_layers_6_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[652] gv2772: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2101: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2772, R.dtype("float16")) _2100: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_6_fc1_weight4, alloc2100, model_decoder_layers_6_fc1_bias4, alloc2101) R.vm.kill_object(alloc2100) R.vm.kill_object(model_decoder_layers_6_fc1_weight4) R.vm.kill_object(model_decoder_layers_6_fc1_bias4) model_decoder_layers_6_fc2_weight4: R.Tensor((1280, 5120), 
dtype="float16") = packed_params[653] model_decoder_layers_6_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[654] gv2773: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2102: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2773, R.dtype("float16")) _2101: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_6_fc2_weight4, alloc2101, model_decoder_layers_6_fc2_bias4, alloc2102) R.vm.kill_object(alloc2101) R.vm.kill_object(model_decoder_layers_6_fc2_weight4) R.vm.kill_object(model_decoder_layers_6_fc2_bias4) gv2774: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2103: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2774, R.dtype("float16")) cls.add5(alloc2099, alloc2102, alloc2103) R.vm.kill_object(alloc2099) R.vm.kill_object(alloc2102) model_decoder_layers_7_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[664] model_decoder_layers_7_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[665] gv2775: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2104: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2775, R.dtype("float16")) cls.layer_norm2(alloc2103, model_decoder_layers_7_self_attn_layer_norm_weight4, model_decoder_layers_7_self_attn_layer_norm_bias4, alloc2104) 
R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias4) model_decoder_layers_7_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[660] model_decoder_layers_7_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[661] gv2776: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2105: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2776, R.dtype("float16")) _2104: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_q_proj_weight4, alloc2104, model_decoder_layers_7_self_attn_q_proj_bias4, alloc2105) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias4) gv2777: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1103: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2105, gv2777, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2105) model_decoder_layers_7_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] gv2778: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2106: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2778, R.dtype("float16")) _2105: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_7_self_attn_k_proj_weight4, alloc2104, alloc2106) R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight4) gv2779: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1104: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2106, gv2779, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2106) model_decoder_layers_7_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] model_decoder_layers_7_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[659] gv2780: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2107: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2780, R.dtype("float16")) _2106: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_v_proj_weight4, alloc2104, model_decoder_layers_7_self_attn_v_proj_bias4, alloc2107) R.vm.kill_object(alloc2104) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias4) gv2781: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1105: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2107, gv2781, 
sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2107) gv2782: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2108: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2782, R.dtype("float16")) cls.concatenate1(reshape1103, reshape1104, reshape1105, alloc2108) R.vm.kill_object(reshape1103) R.vm.kill_object(reshape1104) R.vm.kill_object(reshape1105) gv2783: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1106: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2108, gv2783, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2108) gv2784: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2109: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2784, R.dtype("float16")) _2108: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape1106, alloc2109) R.vm.kill_object(reshape1106) gv2785: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1107: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2109, gv2785, 
sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2109) gv2786: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1108: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1107, gv2786, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1107) model_decoder_layers_7_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] model_decoder_layers_7_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[663] gv2787: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2110: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2787, R.dtype("float16")) _2109: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_out_proj_weight4, reshape1108, model_decoder_layers_7_self_attn_out_proj_bias4, alloc2110) R.vm.kill_object(reshape1108) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias4) gv2788: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2111: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2788, R.dtype("float16")) cls.add5(alloc2103, alloc2110, alloc2111) R.vm.kill_object(alloc2103) R.vm.kill_object(alloc2110) model_decoder_layers_7_encoder_attn_layer_norm_weight4: R.Tensor((1280,), 
dtype="float16") = packed_params[673] model_decoder_layers_7_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[674] gv2789: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2112: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2789, R.dtype("float16")) cls.layer_norm2(alloc2111, model_decoder_layers_7_encoder_attn_layer_norm_weight4, model_decoder_layers_7_encoder_attn_layer_norm_bias4, alloc2112) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias4) model_decoder_layers_7_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] model_decoder_layers_7_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[670] gv2790: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2113: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2790, R.dtype("float16")) _2112: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_q_proj_weight4, alloc2112, model_decoder_layers_7_encoder_attn_q_proj_bias4, alloc2113) R.vm.kill_object(alloc2112) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias4) gv2791: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape1109: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2113, gv2791, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2113) gv2792: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1110: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1109, gv2792, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1109) gv2793: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2114: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2793, R.dtype("float16")) _2113: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape1110, alloc2114) R.vm.kill_object(reshape1110) gv2794: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1111: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2114, gv2794, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2114) gv2795: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1112: R.Tensor((1, seq_len, 1280), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape1111, gv2795, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1111) model_decoder_layers_7_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] model_decoder_layers_7_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[672] gv2796: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2115: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2796, R.dtype("float16")) _2114: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_out_proj_weight4, reshape1112, model_decoder_layers_7_encoder_attn_out_proj_bias4, alloc2115) R.vm.kill_object(reshape1112) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias4) gv2797: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2116: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2797, R.dtype("float16")) cls.add5(alloc2111, alloc2115, alloc2116) R.vm.kill_object(alloc2111) R.vm.kill_object(alloc2115) model_decoder_layers_7_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[679] model_decoder_layers_7_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[680] gv2798: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
alloc2117: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2798, R.dtype("float16")) cls.layer_norm2(alloc2116, model_decoder_layers_7_final_layer_norm_weight4, model_decoder_layers_7_final_layer_norm_bias4, alloc2117) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias4) model_decoder_layers_7_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] model_decoder_layers_7_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[676] gv2799: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2118: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2799, R.dtype("float16")) _2117: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_7_fc1_weight4, alloc2117, model_decoder_layers_7_fc1_bias4, alloc2118) R.vm.kill_object(alloc2117) R.vm.kill_object(model_decoder_layers_7_fc1_weight4) R.vm.kill_object(model_decoder_layers_7_fc1_bias4) model_decoder_layers_7_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] model_decoder_layers_7_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[678] gv2800: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2119: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2800, R.dtype("float16")) _2118: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_7_fc2_weight4, alloc2118, model_decoder_layers_7_fc2_bias4, alloc2119) R.vm.kill_object(alloc2118) 
R.vm.kill_object(model_decoder_layers_7_fc2_weight4) R.vm.kill_object(model_decoder_layers_7_fc2_bias4) gv2801: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2120: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2801, R.dtype("float16")) cls.add5(alloc2116, alloc2119, alloc2120) R.vm.kill_object(alloc2116) R.vm.kill_object(alloc2119) model_decoder_layers_8_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[688] model_decoder_layers_8_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[689] gv2802: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2121: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2802, R.dtype("float16")) cls.layer_norm2(alloc2120, model_decoder_layers_8_self_attn_layer_norm_weight4, model_decoder_layers_8_self_attn_layer_norm_bias4, alloc2121) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias4) model_decoder_layers_8_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] model_decoder_layers_8_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[685] gv2803: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2122: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2803, R.dtype("float16")) _2121: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_q_proj_weight4, alloc2121, model_decoder_layers_8_self_attn_q_proj_bias4, alloc2122) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias4) gv2804: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1113: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2122, gv2804, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2122) model_decoder_layers_8_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] gv2805: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2123: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2805, R.dtype("float16")) _2122: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_8_self_attn_k_proj_weight4, alloc2121, alloc2123) R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight4) gv2806: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1114: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2123, gv2806, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2123) model_decoder_layers_8_self_attn_v_proj_weight4: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[682] model_decoder_layers_8_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[683] gv2807: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2124: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2807, R.dtype("float16")) _2123: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_v_proj_weight4, alloc2121, model_decoder_layers_8_self_attn_v_proj_bias4, alloc2124) R.vm.kill_object(alloc2121) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias4) gv2808: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1115: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2124, gv2808, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2124) gv2809: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2125: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2809, R.dtype("float16")) cls.concatenate1(reshape1113, reshape1114, reshape1115, alloc2125) R.vm.kill_object(reshape1113) R.vm.kill_object(reshape1114) R.vm.kill_object(reshape1115) gv2810: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1116: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2125, gv2810, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2125) gv2811: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2126: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2811, R.dtype("float16")) _2125: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape1116, alloc2126) R.vm.kill_object(reshape1116) gv2812: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1117: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2126, gv2812, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2126) gv2813: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1118: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1117, gv2813, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1117) model_decoder_layers_8_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] model_decoder_layers_8_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") 
= packed_params[687] gv2814: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2127: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2814, R.dtype("float16")) _2126: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_out_proj_weight4, reshape1118, model_decoder_layers_8_self_attn_out_proj_bias4, alloc2127) R.vm.kill_object(reshape1118) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias4) gv2815: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2128: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2815, R.dtype("float16")) cls.add5(alloc2120, alloc2127, alloc2128) R.vm.kill_object(alloc2120) R.vm.kill_object(alloc2127) model_decoder_layers_8_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[697] model_decoder_layers_8_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[698] gv2816: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2129: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2816, R.dtype("float16")) cls.layer_norm2(alloc2128, model_decoder_layers_8_encoder_attn_layer_norm_weight4, model_decoder_layers_8_encoder_attn_layer_norm_bias4, alloc2129) R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight4) 
R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias4) model_decoder_layers_8_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] model_decoder_layers_8_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[694] gv2817: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2130: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2817, R.dtype("float16")) _2129: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_q_proj_weight4, alloc2129, model_decoder_layers_8_encoder_attn_q_proj_bias4, alloc2130) R.vm.kill_object(alloc2129) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias4) gv2818: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1119: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2130, gv2818, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2130) gv2819: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1120: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1119, gv2819, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1119) gv2820: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2131: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2820, R.dtype("float16")) _2130: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape1120, alloc2131) R.vm.kill_object(reshape1120) gv2821: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1121: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2131, gv2821, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2131) gv2822: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1122: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1121, gv2822, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1121) model_decoder_layers_8_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] model_decoder_layers_8_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[696] gv2823: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2132: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2823, R.dtype("float16")) _2131: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_out_proj_weight4, reshape1122, model_decoder_layers_8_encoder_attn_out_proj_bias4, alloc2132) R.vm.kill_object(reshape1122) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias4) gv2824: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2133: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2824, R.dtype("float16")) cls.add5(alloc2128, alloc2132, alloc2133) R.vm.kill_object(alloc2128) R.vm.kill_object(alloc2132) model_decoder_layers_8_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[703] model_decoder_layers_8_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[704] gv2825: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2134: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2825, R.dtype("float16")) cls.layer_norm2(alloc2133, model_decoder_layers_8_final_layer_norm_weight4, model_decoder_layers_8_final_layer_norm_bias4, alloc2134) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias4) model_decoder_layers_8_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] model_decoder_layers_8_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[700] gv2826: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2135: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2826, R.dtype("float16")) _2134: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_8_fc1_weight4, alloc2134, model_decoder_layers_8_fc1_bias4, alloc2135) R.vm.kill_object(alloc2134) R.vm.kill_object(model_decoder_layers_8_fc1_weight4) R.vm.kill_object(model_decoder_layers_8_fc1_bias4) model_decoder_layers_8_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] model_decoder_layers_8_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[702] gv2827: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2136: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2827, R.dtype("float16")) _2135: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_8_fc2_weight4, alloc2135, model_decoder_layers_8_fc2_bias4, alloc2136) R.vm.kill_object(alloc2135) R.vm.kill_object(model_decoder_layers_8_fc2_weight4) R.vm.kill_object(model_decoder_layers_8_fc2_bias4) gv2828: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2137: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2828, R.dtype("float16")) cls.add5(alloc2133, alloc2136, alloc2137) R.vm.kill_object(alloc2133) R.vm.kill_object(alloc2136) model_decoder_layers_9_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[712] model_decoder_layers_9_self_attn_layer_norm_bias4: R.Tensor((1280,), 
dtype="float16") = packed_params[713] gv2829: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2138: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2829, R.dtype("float16")) cls.layer_norm2(alloc2137, model_decoder_layers_9_self_attn_layer_norm_weight4, model_decoder_layers_9_self_attn_layer_norm_bias4, alloc2138) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias4) model_decoder_layers_9_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[708] model_decoder_layers_9_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[709] gv2830: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2139: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2830, R.dtype("float16")) _2138: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_q_proj_weight4, alloc2138, model_decoder_layers_9_self_attn_q_proj_bias4, alloc2139) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias4) gv2831: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1123: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2139, gv2831, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc2139) model_decoder_layers_9_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] gv2832: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2140: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2832, R.dtype("float16")) _2139: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_9_self_attn_k_proj_weight4, alloc2138, alloc2140) R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight4) gv2833: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1124: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2140, gv2833, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2140) model_decoder_layers_9_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] model_decoder_layers_9_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[707] gv2834: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2141: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2834, R.dtype("float16")) _2140: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_v_proj_weight4, alloc2138, model_decoder_layers_9_self_attn_v_proj_bias4, alloc2141) R.vm.kill_object(alloc2138) 
R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias4) gv2835: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1125: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2141, gv2835, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2141) gv2836: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2142: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2836, R.dtype("float16")) cls.concatenate1(reshape1123, reshape1124, reshape1125, alloc2142) R.vm.kill_object(reshape1123) R.vm.kill_object(reshape1124) R.vm.kill_object(reshape1125) gv2837: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1126: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2142, gv2837, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2142) gv2838: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2143: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2838, R.dtype("float16")) _2142: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape1126, alloc2143) R.vm.kill_object(reshape1126) gv2839: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1127: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2143, gv2839, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2143) gv2840: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1128: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1127, gv2840, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1127) model_decoder_layers_9_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] model_decoder_layers_9_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[711] gv2841: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2144: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2841, R.dtype("float16")) _2143: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_out_proj_weight4, reshape1128, model_decoder_layers_9_self_attn_out_proj_bias4, alloc2144) R.vm.kill_object(reshape1128) R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight4) 
R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias4) gv2842: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2145: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2842, R.dtype("float16")) cls.add5(alloc2137, alloc2144, alloc2145) R.vm.kill_object(alloc2137) R.vm.kill_object(alloc2144) model_decoder_layers_9_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[721] model_decoder_layers_9_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[722] gv2843: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2146: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2843, R.dtype("float16")) cls.layer_norm2(alloc2145, model_decoder_layers_9_encoder_attn_layer_norm_weight4, model_decoder_layers_9_encoder_attn_layer_norm_bias4, alloc2146) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias4) model_decoder_layers_9_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[717] model_decoder_layers_9_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[718] gv2844: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2147: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2844, R.dtype("float16")) _2146: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_q_proj_weight4, alloc2146, model_decoder_layers_9_encoder_attn_q_proj_bias4, alloc2147) R.vm.kill_object(alloc2146) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias4) gv2845: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1129: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2147, gv2845, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2147) gv2846: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1130: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1129, gv2846, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1129) gv2847: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2148: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2847, R.dtype("float16")) _2147: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape1130, alloc2148) R.vm.kill_object(reshape1130) gv2848: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1131: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2148, gv2848, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2148) gv2849: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1132: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1131, gv2849, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1131) model_decoder_layers_9_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] model_decoder_layers_9_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[720] gv2850: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2149: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2850, R.dtype("float16")) _2148: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_out_proj_weight4, reshape1132, model_decoder_layers_9_encoder_attn_out_proj_bias4, alloc2149) R.vm.kill_object(reshape1132) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias4) gv2851: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2150: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage39, R.prim_value(0), gv2851, R.dtype("float16")) cls.add5(alloc2145, alloc2149, alloc2150) R.vm.kill_object(alloc2145) R.vm.kill_object(alloc2149) model_decoder_layers_9_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[727] model_decoder_layers_9_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[728] gv2852: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2151: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2852, R.dtype("float16")) cls.layer_norm2(alloc2150, model_decoder_layers_9_final_layer_norm_weight4, model_decoder_layers_9_final_layer_norm_bias4, alloc2151) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias4) model_decoder_layers_9_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] model_decoder_layers_9_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[724] gv2853: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2152: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2853, R.dtype("float16")) _2151: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_9_fc1_weight4, alloc2151, model_decoder_layers_9_fc1_bias4, alloc2152) R.vm.kill_object(alloc2151) R.vm.kill_object(model_decoder_layers_9_fc1_weight4) R.vm.kill_object(model_decoder_layers_9_fc1_bias4) model_decoder_layers_9_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] model_decoder_layers_9_fc2_bias4: 
R.Tensor((1280,), dtype="float16") = packed_params[726] gv2854: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2153: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2854, R.dtype("float16")) _2152: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_9_fc2_weight4, alloc2152, model_decoder_layers_9_fc2_bias4, alloc2153) R.vm.kill_object(alloc2152) R.vm.kill_object(model_decoder_layers_9_fc2_weight4) R.vm.kill_object(model_decoder_layers_9_fc2_bias4) gv2855: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2154: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2855, R.dtype("float16")) cls.add5(alloc2150, alloc2153, alloc2154) R.vm.kill_object(alloc2150) R.vm.kill_object(alloc2153) model_decoder_layers_10_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[736] model_decoder_layers_10_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[737] gv2856: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2155: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2856, R.dtype("float16")) cls.layer_norm2(alloc2154, model_decoder_layers_10_self_attn_layer_norm_weight4, model_decoder_layers_10_self_attn_layer_norm_bias4, alloc2155) R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight4) 
R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias4) model_decoder_layers_10_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] model_decoder_layers_10_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[733] gv2857: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2156: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2857, R.dtype("float16")) _2155: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_q_proj_weight4, alloc2155, model_decoder_layers_10_self_attn_q_proj_bias4, alloc2156) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias4) gv2858: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1133: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2156, gv2858, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2156) model_decoder_layers_10_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[729] gv2859: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2157: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2859, R.dtype("float16")) _2156: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", 
model_decoder_layers_10_self_attn_k_proj_weight4, alloc2155, alloc2157) R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight4) gv2860: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1134: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2157, gv2860, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2157) model_decoder_layers_10_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] model_decoder_layers_10_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[731] gv2861: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2158: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2861, R.dtype("float16")) _2157: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_v_proj_weight4, alloc2155, model_decoder_layers_10_self_attn_v_proj_bias4, alloc2158) R.vm.kill_object(alloc2155) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias4) gv2862: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1135: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2158, gv2862, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc2158) gv2863: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2159: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2863, R.dtype("float16")) cls.concatenate1(reshape1133, reshape1134, reshape1135, alloc2159) R.vm.kill_object(reshape1133) R.vm.kill_object(reshape1134) R.vm.kill_object(reshape1135) gv2864: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1136: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2159, gv2864, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2159) gv2865: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2160: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2865, R.dtype("float16")) _2159: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape1136, alloc2160) R.vm.kill_object(reshape1136) gv2866: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1137: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2160, gv2866, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc2160) gv2867: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1138: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1137, gv2867, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1137) model_decoder_layers_10_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] model_decoder_layers_10_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[735] gv2868: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2161: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2868, R.dtype("float16")) _2160: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_out_proj_weight4, reshape1138, model_decoder_layers_10_self_attn_out_proj_bias4, alloc2161) R.vm.kill_object(reshape1138) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias4) gv2869: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2162: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2869, R.dtype("float16")) cls.add5(alloc2154, alloc2161, alloc2162) R.vm.kill_object(alloc2154) R.vm.kill_object(alloc2161) model_decoder_layers_10_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = 
packed_params[745] model_decoder_layers_10_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[746] gv2870: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2163: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2870, R.dtype("float16")) cls.layer_norm2(alloc2162, model_decoder_layers_10_encoder_attn_layer_norm_weight4, model_decoder_layers_10_encoder_attn_layer_norm_bias4, alloc2163) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias4) model_decoder_layers_10_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] model_decoder_layers_10_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[742] gv2871: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2164: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2871, R.dtype("float16")) _2163: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_q_proj_weight4, alloc2163, model_decoder_layers_10_encoder_attn_q_proj_bias4, alloc2164) R.vm.kill_object(alloc2163) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias4) gv2872: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) 
reshape1139: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2164, gv2872, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2164) gv2873: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1140: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1139, gv2873, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1139) gv2874: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2165: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2874, R.dtype("float16")) _2164: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape1140, alloc2165) R.vm.kill_object(reshape1140) gv2875: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1141: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2165, gv2875, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2165) gv2876: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1142: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape1141, gv2876, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1141) model_decoder_layers_10_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] model_decoder_layers_10_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[744] gv2877: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2166: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2877, R.dtype("float16")) _2165: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_out_proj_weight4, reshape1142, model_decoder_layers_10_encoder_attn_out_proj_bias4, alloc2166) R.vm.kill_object(reshape1142) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias4) gv2878: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2167: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2878, R.dtype("float16")) cls.add5(alloc2162, alloc2166, alloc2167) R.vm.kill_object(alloc2162) R.vm.kill_object(alloc2166) model_decoder_layers_10_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[751] model_decoder_layers_10_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[752] gv2879: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2168: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2879, R.dtype("float16")) cls.layer_norm2(alloc2167, model_decoder_layers_10_final_layer_norm_weight4, model_decoder_layers_10_final_layer_norm_bias4, alloc2168) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias4) model_decoder_layers_10_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] model_decoder_layers_10_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[748] gv2880: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2169: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2880, R.dtype("float16")) _2168: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_10_fc1_weight4, alloc2168, model_decoder_layers_10_fc1_bias4, alloc2169) R.vm.kill_object(alloc2168) R.vm.kill_object(model_decoder_layers_10_fc1_weight4) R.vm.kill_object(model_decoder_layers_10_fc1_bias4) model_decoder_layers_10_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] model_decoder_layers_10_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[750] gv2881: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2170: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2881, R.dtype("float16")) _2169: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_10_fc2_weight4, alloc2169, model_decoder_layers_10_fc2_bias4, alloc2170) 
R.vm.kill_object(alloc2169) R.vm.kill_object(model_decoder_layers_10_fc2_weight4) R.vm.kill_object(model_decoder_layers_10_fc2_bias4) gv2882: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2171: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2882, R.dtype("float16")) cls.add5(alloc2167, alloc2170, alloc2171) R.vm.kill_object(alloc2167) R.vm.kill_object(alloc2170) model_decoder_layers_11_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[760] model_decoder_layers_11_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[761] gv2883: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2172: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2883, R.dtype("float16")) cls.layer_norm2(alloc2171, model_decoder_layers_11_self_attn_layer_norm_weight4, model_decoder_layers_11_self_attn_layer_norm_bias4, alloc2172) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias4) model_decoder_layers_11_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] model_decoder_layers_11_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[757] gv2884: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2173: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2884, 
R.dtype("float16")) _2172: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_q_proj_weight4, alloc2172, model_decoder_layers_11_self_attn_q_proj_bias4, alloc2173) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias4) gv2885: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1143: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2173, gv2885, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2173) model_decoder_layers_11_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] gv2886: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2174: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2886, R.dtype("float16")) _2173: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_11_self_attn_k_proj_weight4, alloc2172, alloc2174) R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight4) gv2887: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1144: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2174, gv2887, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2174) 
model_decoder_layers_11_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] model_decoder_layers_11_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[755] gv2888: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2175: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2888, R.dtype("float16")) _2174: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_v_proj_weight4, alloc2172, model_decoder_layers_11_self_attn_v_proj_bias4, alloc2175) R.vm.kill_object(alloc2172) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias4) gv2889: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1145: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2175, gv2889, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2175) gv2890: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2176: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2890, R.dtype("float16")) cls.concatenate1(reshape1143, reshape1144, reshape1145, alloc2176) R.vm.kill_object(reshape1143) R.vm.kill_object(reshape1144) R.vm.kill_object(reshape1145) gv2891: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1146: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2176, gv2891, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2176) gv2892: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2177: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2892, R.dtype("float16")) _2176: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape1146, alloc2177) R.vm.kill_object(reshape1146) gv2893: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1147: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2177, gv2893, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2177) gv2894: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1148: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1147, gv2894, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1147) model_decoder_layers_11_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] 
model_decoder_layers_11_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[759] gv2895: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2178: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2895, R.dtype("float16")) _2177: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_out_proj_weight4, reshape1148, model_decoder_layers_11_self_attn_out_proj_bias4, alloc2178) R.vm.kill_object(reshape1148) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias4) gv2896: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2179: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2896, R.dtype("float16")) cls.add5(alloc2171, alloc2178, alloc2179) R.vm.kill_object(alloc2171) R.vm.kill_object(alloc2178) model_decoder_layers_11_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[769] model_decoder_layers_11_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[770] gv2897: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2180: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2897, R.dtype("float16")) cls.layer_norm2(alloc2179, model_decoder_layers_11_encoder_attn_layer_norm_weight4, 
model_decoder_layers_11_encoder_attn_layer_norm_bias4, alloc2180) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias4) model_decoder_layers_11_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] model_decoder_layers_11_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[766] gv2898: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2181: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2898, R.dtype("float16")) _2180: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_q_proj_weight4, alloc2180, model_decoder_layers_11_encoder_attn_q_proj_bias4, alloc2181) R.vm.kill_object(alloc2180) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias4) gv2899: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1149: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2181, gv2899, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2181) gv2900: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1150: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1149, gv2900, 
sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1149) gv2901: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2182: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2901, R.dtype("float16")) _2181: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape1150, alloc2182) R.vm.kill_object(reshape1150) gv2902: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1151: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2182, gv2902, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2182) gv2903: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1152: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1151, gv2903, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1151) model_decoder_layers_11_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] model_decoder_layers_11_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[768] gv2904: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2183: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2904, R.dtype("float16")) _2182: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_out_proj_weight4, reshape1152, model_decoder_layers_11_encoder_attn_out_proj_bias4, alloc2183) R.vm.kill_object(reshape1152) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias4) gv2905: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2184: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2905, R.dtype("float16")) cls.add5(alloc2179, alloc2183, alloc2184) R.vm.kill_object(alloc2179) R.vm.kill_object(alloc2183) model_decoder_layers_11_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[775] model_decoder_layers_11_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[776] gv2906: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2185: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2906, R.dtype("float16")) cls.layer_norm2(alloc2184, model_decoder_layers_11_final_layer_norm_weight4, model_decoder_layers_11_final_layer_norm_bias4, alloc2185) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias4) model_decoder_layers_11_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] model_decoder_layers_11_fc1_bias4: R.Tensor((5120,), dtype="float16") = 
packed_params[772] gv2907: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2186: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2907, R.dtype("float16")) _2185: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_11_fc1_weight4, alloc2185, model_decoder_layers_11_fc1_bias4, alloc2186) R.vm.kill_object(alloc2185) R.vm.kill_object(model_decoder_layers_11_fc1_weight4) R.vm.kill_object(model_decoder_layers_11_fc1_bias4) model_decoder_layers_11_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] model_decoder_layers_11_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[774] gv2908: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2187: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2908, R.dtype("float16")) _2186: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_11_fc2_weight4, alloc2186, model_decoder_layers_11_fc2_bias4, alloc2187) R.vm.kill_object(alloc2186) R.vm.kill_object(model_decoder_layers_11_fc2_weight4) R.vm.kill_object(model_decoder_layers_11_fc2_bias4) gv2909: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2188: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2909, R.dtype("float16")) cls.add5(alloc2184, alloc2187, alloc2188) R.vm.kill_object(alloc2184) 
R.vm.kill_object(alloc2187) model_decoder_layers_12_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[784] model_decoder_layers_12_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[785] gv2910: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2189: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2910, R.dtype("float16")) cls.layer_norm2(alloc2188, model_decoder_layers_12_self_attn_layer_norm_weight4, model_decoder_layers_12_self_attn_layer_norm_bias4, alloc2189) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias4) model_decoder_layers_12_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] model_decoder_layers_12_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[781] gv2911: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2190: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2911, R.dtype("float16")) _2189: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_q_proj_weight4, alloc2189, model_decoder_layers_12_self_attn_q_proj_bias4, alloc2190) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias4) gv2912: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1153: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2190, gv2912, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2190) model_decoder_layers_12_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] gv2913: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2191: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2913, R.dtype("float16")) _2190: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_12_self_attn_k_proj_weight4, alloc2189, alloc2191) R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight4) gv2914: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1154: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2191, gv2914, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2191) model_decoder_layers_12_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[778] model_decoder_layers_12_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[779] gv2915: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2192: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2915, R.dtype("float16")) 
_2191: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_v_proj_weight4, alloc2189, model_decoder_layers_12_self_attn_v_proj_bias4, alloc2192) R.vm.kill_object(alloc2189) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias4) gv2916: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1155: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2192, gv2916, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2192) gv2917: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2193: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2917, R.dtype("float16")) cls.concatenate1(reshape1153, reshape1154, reshape1155, alloc2193) R.vm.kill_object(reshape1153) R.vm.kill_object(reshape1154) R.vm.kill_object(reshape1155) gv2918: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1156: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2193, gv2918, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2193) gv2919: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2194: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2919, R.dtype("float16")) _2193: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape1156, alloc2194) R.vm.kill_object(reshape1156) gv2920: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1157: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2194, gv2920, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2194) gv2921: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1158: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1157, gv2921, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1157) model_decoder_layers_12_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] model_decoder_layers_12_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[783] gv2922: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2195: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2922, R.dtype("float16")) _2194: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_out_proj_weight4, 
reshape1158, model_decoder_layers_12_self_attn_out_proj_bias4, alloc2195) R.vm.kill_object(reshape1158) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias4) gv2923: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2196: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2923, R.dtype("float16")) cls.add5(alloc2188, alloc2195, alloc2196) R.vm.kill_object(alloc2188) R.vm.kill_object(alloc2195) model_decoder_layers_12_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[793] model_decoder_layers_12_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[794] gv2924: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2197: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2924, R.dtype("float16")) cls.layer_norm2(alloc2196, model_decoder_layers_12_encoder_attn_layer_norm_weight4, model_decoder_layers_12_encoder_attn_layer_norm_bias4, alloc2197) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias4) model_decoder_layers_12_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] model_decoder_layers_12_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[790] gv2925: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2198: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2925, R.dtype("float16")) _2197: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_q_proj_weight4, alloc2197, model_decoder_layers_12_encoder_attn_q_proj_bias4, alloc2198) R.vm.kill_object(alloc2197) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias4) gv2926: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1159: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2198, gv2926, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2198) gv2927: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1160: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1159, gv2927, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1159) gv2928: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2199: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2928, R.dtype("float16")) _2198: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape1160, alloc2199) R.vm.kill_object(reshape1160) gv2929: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1161: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2199, gv2929, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2199) gv2930: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1162: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1161, gv2930, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1161) model_decoder_layers_12_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] model_decoder_layers_12_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[792] gv2931: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2200: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2931, R.dtype("float16")) _2199: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_out_proj_weight4, reshape1162, model_decoder_layers_12_encoder_attn_out_proj_bias4, alloc2200) R.vm.kill_object(reshape1162) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias4) gv2932: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2201: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2932, R.dtype("float16")) cls.add5(alloc2196, alloc2200, alloc2201) R.vm.kill_object(alloc2196) R.vm.kill_object(alloc2200) model_decoder_layers_12_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[799] model_decoder_layers_12_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[800] gv2933: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2202: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2933, R.dtype("float16")) cls.layer_norm2(alloc2201, model_decoder_layers_12_final_layer_norm_weight4, model_decoder_layers_12_final_layer_norm_bias4, alloc2202) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias4) model_decoder_layers_12_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] model_decoder_layers_12_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[796] gv2934: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2203: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2934, R.dtype("float16")) _2202: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_12_fc1_weight4, alloc2202, model_decoder_layers_12_fc1_bias4, alloc2203) R.vm.kill_object(alloc2202) R.vm.kill_object(model_decoder_layers_12_fc1_weight4) 
R.vm.kill_object(model_decoder_layers_12_fc1_bias4) model_decoder_layers_12_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] model_decoder_layers_12_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[798] gv2935: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2204: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2935, R.dtype("float16")) _2203: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_12_fc2_weight4, alloc2203, model_decoder_layers_12_fc2_bias4, alloc2204) R.vm.kill_object(alloc2203) R.vm.kill_object(model_decoder_layers_12_fc2_weight4) R.vm.kill_object(model_decoder_layers_12_fc2_bias4) gv2936: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2205: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2936, R.dtype("float16")) cls.add5(alloc2201, alloc2204, alloc2205) R.vm.kill_object(alloc2201) R.vm.kill_object(alloc2204) model_decoder_layers_13_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[808] model_decoder_layers_13_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[809] gv2937: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2206: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2937, R.dtype("float16")) cls.layer_norm2(alloc2205, 
model_decoder_layers_13_self_attn_layer_norm_weight4, model_decoder_layers_13_self_attn_layer_norm_bias4, alloc2206) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias4) model_decoder_layers_13_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] model_decoder_layers_13_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[805] gv2938: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2207: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2938, R.dtype("float16")) _2206: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_q_proj_weight4, alloc2206, model_decoder_layers_13_self_attn_q_proj_bias4, alloc2207) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias4) gv2939: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1163: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2207, gv2939, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2207) model_decoder_layers_13_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] gv2940: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2208: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2940, R.dtype("float16")) _2207: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_13_self_attn_k_proj_weight4, alloc2206, alloc2208) R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight4) gv2941: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1164: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2208, gv2941, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2208) model_decoder_layers_13_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] model_decoder_layers_13_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[803] gv2942: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2209: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2942, R.dtype("float16")) _2208: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_v_proj_weight4, alloc2206, model_decoder_layers_13_self_attn_v_proj_bias4, alloc2209) R.vm.kill_object(alloc2206) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias4) gv2943: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) reshape1165: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2209, gv2943, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2209) gv2944: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2210: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2944, R.dtype("float16")) cls.concatenate1(reshape1163, reshape1164, reshape1165, alloc2210) R.vm.kill_object(reshape1163) R.vm.kill_object(reshape1164) R.vm.kill_object(reshape1165) gv2945: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1166: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2210, gv2945, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2210) gv2946: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2211: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2946, R.dtype("float16")) _2210: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape1166, alloc2211) R.vm.kill_object(reshape1166) gv2947: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1167: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2211, gv2947, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2211) gv2948: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1168: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1167, gv2948, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1167) model_decoder_layers_13_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] model_decoder_layers_13_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[807] gv2949: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2212: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2949, R.dtype("float16")) _2211: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_out_proj_weight4, reshape1168, model_decoder_layers_13_self_attn_out_proj_bias4, alloc2212) R.vm.kill_object(reshape1168) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias4) gv2950: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2213: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2950, 
R.dtype("float16")) cls.add5(alloc2205, alloc2212, alloc2213) R.vm.kill_object(alloc2205) R.vm.kill_object(alloc2212) model_decoder_layers_13_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[817] model_decoder_layers_13_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[818] gv2951: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2214: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2951, R.dtype("float16")) cls.layer_norm2(alloc2213, model_decoder_layers_13_encoder_attn_layer_norm_weight4, model_decoder_layers_13_encoder_attn_layer_norm_bias4, alloc2214) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias4) model_decoder_layers_13_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] model_decoder_layers_13_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[814] gv2952: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2215: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2952, R.dtype("float16")) _2214: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_q_proj_weight4, alloc2214, model_decoder_layers_13_encoder_attn_q_proj_bias4, alloc2215) R.vm.kill_object(alloc2214) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias4) gv2953: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1169: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2215, gv2953, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2215) gv2954: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1170: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1169, gv2954, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1169) gv2955: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2216: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2955, R.dtype("float16")) _2215: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape1170, alloc2216) R.vm.kill_object(reshape1170) gv2956: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1171: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2216, gv2956, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2216) gv2957: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1172: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1171, gv2957, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1171) model_decoder_layers_13_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] model_decoder_layers_13_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[816] gv2958: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2217: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2958, R.dtype("float16")) _2216: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_out_proj_weight4, reshape1172, model_decoder_layers_13_encoder_attn_out_proj_bias4, alloc2217) R.vm.kill_object(reshape1172) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias4) gv2959: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2218: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2959, R.dtype("float16")) cls.add5(alloc2213, alloc2217, alloc2218) R.vm.kill_object(alloc2213) R.vm.kill_object(alloc2217) model_decoder_layers_13_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[823] model_decoder_layers_13_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[824] gv2960: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2219: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2960, R.dtype("float16")) cls.layer_norm2(alloc2218, model_decoder_layers_13_final_layer_norm_weight4, model_decoder_layers_13_final_layer_norm_bias4, alloc2219) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias4) model_decoder_layers_13_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] model_decoder_layers_13_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[820] gv2961: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2220: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2961, R.dtype("float16")) _2219: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_13_fc1_weight4, alloc2219, model_decoder_layers_13_fc1_bias4, alloc2220) R.vm.kill_object(alloc2219) R.vm.kill_object(model_decoder_layers_13_fc1_weight4) R.vm.kill_object(model_decoder_layers_13_fc1_bias4) model_decoder_layers_13_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] model_decoder_layers_13_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[822] gv2962: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2221: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2962, 
R.dtype("float16")) _2220: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_13_fc2_weight4, alloc2220, model_decoder_layers_13_fc2_bias4, alloc2221) R.vm.kill_object(alloc2220) R.vm.kill_object(model_decoder_layers_13_fc2_weight4) R.vm.kill_object(model_decoder_layers_13_fc2_bias4) gv2963: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2222: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2963, R.dtype("float16")) cls.add5(alloc2218, alloc2221, alloc2222) R.vm.kill_object(alloc2218) R.vm.kill_object(alloc2221) model_decoder_layers_14_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[832] model_decoder_layers_14_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[833] gv2964: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2223: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2964, R.dtype("float16")) cls.layer_norm2(alloc2222, model_decoder_layers_14_self_attn_layer_norm_weight4, model_decoder_layers_14_self_attn_layer_norm_bias4, alloc2223) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias4) model_decoder_layers_14_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] model_decoder_layers_14_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[829] gv2965: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2224: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2965, R.dtype("float16")) _2223: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_q_proj_weight4, alloc2223, model_decoder_layers_14_self_attn_q_proj_bias4, alloc2224) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias4) gv2966: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1173: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2224, gv2966, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2224) model_decoder_layers_14_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] gv2967: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2225: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2967, R.dtype("float16")) _2224: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_14_self_attn_k_proj_weight4, alloc2223, alloc2225) R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight4) gv2968: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1174: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2225, gv2968, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2225) model_decoder_layers_14_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] model_decoder_layers_14_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[827] gv2969: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2226: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2969, R.dtype("float16")) _2225: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_v_proj_weight4, alloc2223, model_decoder_layers_14_self_attn_v_proj_bias4, alloc2226) R.vm.kill_object(alloc2223) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias4) gv2970: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1175: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2226, gv2970, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2226) gv2971: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2227: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2971, R.dtype("float16")) 
cls.concatenate1(reshape1173, reshape1174, reshape1175, alloc2227) R.vm.kill_object(reshape1173) R.vm.kill_object(reshape1174) R.vm.kill_object(reshape1175) gv2972: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1176: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2227, gv2972, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2227) gv2973: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2228: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2973, R.dtype("float16")) _2227: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape1176, alloc2228) R.vm.kill_object(reshape1176) gv2974: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1177: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2228, gv2974, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2228) gv2975: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1178: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1177, gv2975, sinfo_args=(R.Tensor((1, seq_len, 
1280), dtype="float16"),)) R.vm.kill_object(reshape1177) model_decoder_layers_14_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] model_decoder_layers_14_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[831] gv2976: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2229: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2976, R.dtype("float16")) _2228: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_out_proj_weight4, reshape1178, model_decoder_layers_14_self_attn_out_proj_bias4, alloc2229) R.vm.kill_object(reshape1178) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias4) gv2977: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2230: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2977, R.dtype("float16")) cls.add5(alloc2222, alloc2229, alloc2230) R.vm.kill_object(alloc2222) R.vm.kill_object(alloc2229) model_decoder_layers_14_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[841] model_decoder_layers_14_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[842] gv2978: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2231: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, 
R.prim_value(0), gv2978, R.dtype("float16")) cls.layer_norm2(alloc2230, model_decoder_layers_14_encoder_attn_layer_norm_weight4, model_decoder_layers_14_encoder_attn_layer_norm_bias4, alloc2231) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias4) model_decoder_layers_14_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] model_decoder_layers_14_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[838] gv2979: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2232: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2979, R.dtype("float16")) _2231: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_q_proj_weight4, alloc2231, model_decoder_layers_14_encoder_attn_q_proj_bias4, alloc2232) R.vm.kill_object(alloc2231) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias4) gv2980: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1179: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2232, gv2980, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2232) gv2981: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) 
reshape1180: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1179, gv2981, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1179) gv2982: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2233: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2982, R.dtype("float16")) _2232: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape1180, alloc2233) R.vm.kill_object(reshape1180) gv2983: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1181: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2233, gv2983, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2233) gv2984: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1182: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1181, gv2984, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1181) model_decoder_layers_14_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] model_decoder_layers_14_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[840] gv2985: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2234: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2985, R.dtype("float16")) _2233: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_out_proj_weight4, reshape1182, model_decoder_layers_14_encoder_attn_out_proj_bias4, alloc2234) R.vm.kill_object(reshape1182) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias4) gv2986: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2235: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2986, R.dtype("float16")) cls.add5(alloc2230, alloc2234, alloc2235) R.vm.kill_object(alloc2230) R.vm.kill_object(alloc2234) model_decoder_layers_14_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[847] model_decoder_layers_14_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[848] gv2987: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2236: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2987, R.dtype("float16")) cls.layer_norm2(alloc2235, model_decoder_layers_14_final_layer_norm_weight4, model_decoder_layers_14_final_layer_norm_bias4, alloc2236) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias4) model_decoder_layers_14_fc1_weight4: R.Tensor((5120, 1280), 
dtype="float16") = packed_params[843] model_decoder_layers_14_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[844] gv2988: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2237: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2988, R.dtype("float16")) _2236: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_14_fc1_weight4, alloc2236, model_decoder_layers_14_fc1_bias4, alloc2237) R.vm.kill_object(alloc2236) R.vm.kill_object(model_decoder_layers_14_fc1_weight4) R.vm.kill_object(model_decoder_layers_14_fc1_bias4) model_decoder_layers_14_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] model_decoder_layers_14_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[846] gv2989: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2238: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2989, R.dtype("float16")) _2237: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_14_fc2_weight4, alloc2237, model_decoder_layers_14_fc2_bias4, alloc2238) R.vm.kill_object(alloc2237) R.vm.kill_object(model_decoder_layers_14_fc2_weight4) R.vm.kill_object(model_decoder_layers_14_fc2_bias4) gv2990: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2239: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), 
gv2990, R.dtype("float16")) cls.add5(alloc2235, alloc2238, alloc2239) R.vm.kill_object(alloc2235) R.vm.kill_object(alloc2238) model_decoder_layers_15_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[856] model_decoder_layers_15_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[857] gv2991: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2240: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2991, R.dtype("float16")) cls.layer_norm2(alloc2239, model_decoder_layers_15_self_attn_layer_norm_weight4, model_decoder_layers_15_self_attn_layer_norm_bias4, alloc2240) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias4) model_decoder_layers_15_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] model_decoder_layers_15_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[853] gv2992: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2241: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2992, R.dtype("float16")) _2240: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_q_proj_weight4, alloc2240, model_decoder_layers_15_self_attn_q_proj_bias4, alloc2241) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias4) gv2993: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1183: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2241, gv2993, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2241) model_decoder_layers_15_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] gv2994: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2242: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2994, R.dtype("float16")) _2241: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_15_self_attn_k_proj_weight4, alloc2240, alloc2242) R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight4) gv2995: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1184: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2242, gv2995, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2242) model_decoder_layers_15_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] model_decoder_layers_15_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[851] gv2996: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2243: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2996, R.dtype("float16")) _2242: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_v_proj_weight4, alloc2240, model_decoder_layers_15_self_attn_v_proj_bias4, alloc2243) R.vm.kill_object(alloc2240) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias4) gv2997: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1185: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2243, gv2997, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2243) gv2998: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2244: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2998, R.dtype("float16")) cls.concatenate1(reshape1183, reshape1184, reshape1185, alloc2244) R.vm.kill_object(reshape1183) R.vm.kill_object(reshape1184) R.vm.kill_object(reshape1185) gv2999: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1186: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2244, gv2999, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2244) gv3000: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2245: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3000, R.dtype("float16")) _2244: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape1186, alloc2245) R.vm.kill_object(reshape1186) gv3001: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1187: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2245, gv3001, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2245) gv3002: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1188: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1187, gv3002, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1187) model_decoder_layers_15_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] model_decoder_layers_15_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[855] gv3003: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2246: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3003, R.dtype("float16")) _2245: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_out_proj_weight4, reshape1188, model_decoder_layers_15_self_attn_out_proj_bias4, alloc2246) R.vm.kill_object(reshape1188) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias4) gv3004: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2247: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3004, R.dtype("float16")) cls.add5(alloc2239, alloc2246, alloc2247) R.vm.kill_object(alloc2239) R.vm.kill_object(alloc2246) model_decoder_layers_15_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[865] model_decoder_layers_15_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[866] gv3005: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2248: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3005, R.dtype("float16")) cls.layer_norm2(alloc2247, model_decoder_layers_15_encoder_attn_layer_norm_weight4, model_decoder_layers_15_encoder_attn_layer_norm_bias4, alloc2248) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias4) model_decoder_layers_15_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] model_decoder_layers_15_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[862] gv3006: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2249: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3006, R.dtype("float16")) _2248: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_q_proj_weight4, alloc2248, model_decoder_layers_15_encoder_attn_q_proj_bias4, alloc2249) R.vm.kill_object(alloc2248) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias4) gv3007: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1189: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2249, gv3007, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2249) gv3008: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1190: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1189, gv3008, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1189) gv3009: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2250: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3009, R.dtype("float16")) _2249: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, 
R.prim_value(15), R.prim_value(T.float32(1)), reshape1190, alloc2250) R.vm.kill_object(reshape1190) gv3010: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1191: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2250, gv3010, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2250) gv3011: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1192: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1191, gv3011, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1191) model_decoder_layers_15_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] model_decoder_layers_15_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[864] gv3012: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2251: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3012, R.dtype("float16")) _2250: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_out_proj_weight4, reshape1192, model_decoder_layers_15_encoder_attn_out_proj_bias4, alloc2251) R.vm.kill_object(reshape1192) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias4) gv3013: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2252: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3013, R.dtype("float16")) cls.add5(alloc2247, alloc2251, alloc2252) R.vm.kill_object(alloc2247) R.vm.kill_object(alloc2251) model_decoder_layers_15_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[871] model_decoder_layers_15_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[872] gv3014: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2253: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3014, R.dtype("float16")) cls.layer_norm2(alloc2252, model_decoder_layers_15_final_layer_norm_weight4, model_decoder_layers_15_final_layer_norm_bias4, alloc2253) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias4) model_decoder_layers_15_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] model_decoder_layers_15_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[868] gv3015: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2254: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3015, R.dtype("float16")) _2253: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_15_fc1_weight4, alloc2253, model_decoder_layers_15_fc1_bias4, 
alloc2254) R.vm.kill_object(alloc2253) R.vm.kill_object(model_decoder_layers_15_fc1_weight4) R.vm.kill_object(model_decoder_layers_15_fc1_bias4) model_decoder_layers_15_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] model_decoder_layers_15_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[870] gv3016: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2255: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3016, R.dtype("float16")) _2254: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_15_fc2_weight4, alloc2254, model_decoder_layers_15_fc2_bias4, alloc2255) R.vm.kill_object(alloc2254) R.vm.kill_object(model_decoder_layers_15_fc2_weight4) R.vm.kill_object(model_decoder_layers_15_fc2_bias4) gv3017: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2256: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3017, R.dtype("float16")) cls.add5(alloc2252, alloc2255, alloc2256) R.vm.kill_object(alloc2252) R.vm.kill_object(alloc2255) model_decoder_layers_16_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[880] model_decoder_layers_16_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[881] gv3018: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2257: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), 
gv3018, R.dtype("float16")) cls.layer_norm2(alloc2256, model_decoder_layers_16_self_attn_layer_norm_weight4, model_decoder_layers_16_self_attn_layer_norm_bias4, alloc2257) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias4) model_decoder_layers_16_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] model_decoder_layers_16_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[877] gv3019: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2258: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3019, R.dtype("float16")) _2257: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_q_proj_weight4, alloc2257, model_decoder_layers_16_self_attn_q_proj_bias4, alloc2258) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias4) gv3020: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1193: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2258, gv3020, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2258) model_decoder_layers_16_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] gv3021: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2259: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3021, R.dtype("float16")) _2258: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_16_self_attn_k_proj_weight4, alloc2257, alloc2259) R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight4) gv3022: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1194: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2259, gv3022, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2259) model_decoder_layers_16_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] model_decoder_layers_16_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[875] gv3023: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2260: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3023, R.dtype("float16")) _2259: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_v_proj_weight4, alloc2257, model_decoder_layers_16_self_attn_v_proj_bias4, alloc2260) R.vm.kill_object(alloc2257) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias4) gv3024: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1195: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2260, gv3024, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2260) gv3025: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2261: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3025, R.dtype("float16")) cls.concatenate1(reshape1193, reshape1194, reshape1195, alloc2261) R.vm.kill_object(reshape1193) R.vm.kill_object(reshape1194) R.vm.kill_object(reshape1195) gv3026: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1196: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2261, gv3026, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2261) gv3027: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2262: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3027, R.dtype("float16")) _2261: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape1196, alloc2262) R.vm.kill_object(reshape1196) gv3028: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1197: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2262, gv3028, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2262) gv3029: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1198: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1197, gv3029, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1197) model_decoder_layers_16_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] model_decoder_layers_16_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[879] gv3030: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2263: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3030, R.dtype("float16")) _2262: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_out_proj_weight4, reshape1198, model_decoder_layers_16_self_attn_out_proj_bias4, alloc2263) R.vm.kill_object(reshape1198) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias4) gv3031: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2264: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, 
R.prim_value(0), gv3031, R.dtype("float16")) cls.add5(alloc2256, alloc2263, alloc2264) R.vm.kill_object(alloc2256) R.vm.kill_object(alloc2263) model_decoder_layers_16_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[889] model_decoder_layers_16_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[890] gv3032: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2265: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3032, R.dtype("float16")) cls.layer_norm2(alloc2264, model_decoder_layers_16_encoder_attn_layer_norm_weight4, model_decoder_layers_16_encoder_attn_layer_norm_bias4, alloc2265) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias4) model_decoder_layers_16_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] model_decoder_layers_16_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[886] gv3033: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2266: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3033, R.dtype("float16")) _2265: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_q_proj_weight4, alloc2265, model_decoder_layers_16_encoder_attn_q_proj_bias4, alloc2266) R.vm.kill_object(alloc2265) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias4) gv3034: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1199: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2266, gv3034, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2266) gv3035: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1200: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1199, gv3035, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1199) gv3036: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2267: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3036, R.dtype("float16")) _2266: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape1200, alloc2267) R.vm.kill_object(reshape1200) gv3037: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1201: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2267, gv3037, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2267) gv3038: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1202: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1201, gv3038, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1201) model_decoder_layers_16_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] model_decoder_layers_16_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[888] gv3039: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2268: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3039, R.dtype("float16")) _2267: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_out_proj_weight4, reshape1202, model_decoder_layers_16_encoder_attn_out_proj_bias4, alloc2268) R.vm.kill_object(reshape1202) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias4) gv3040: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2269: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3040, R.dtype("float16")) cls.add5(alloc2264, alloc2268, alloc2269) R.vm.kill_object(alloc2264) R.vm.kill_object(alloc2268) model_decoder_layers_16_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[895] model_decoder_layers_16_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[896] gv3041: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2270: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3041, R.dtype("float16")) cls.layer_norm2(alloc2269, model_decoder_layers_16_final_layer_norm_weight4, model_decoder_layers_16_final_layer_norm_bias4, alloc2270) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias4) model_decoder_layers_16_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] model_decoder_layers_16_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[892] gv3042: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2271: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3042, R.dtype("float16")) _2270: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_16_fc1_weight4, alloc2270, model_decoder_layers_16_fc1_bias4, alloc2271) R.vm.kill_object(alloc2270) R.vm.kill_object(model_decoder_layers_16_fc1_weight4) R.vm.kill_object(model_decoder_layers_16_fc1_bias4) model_decoder_layers_16_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] model_decoder_layers_16_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[894] gv3043: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2272: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3043, 
R.dtype("float16")) _2271: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_16_fc2_weight4, alloc2271, model_decoder_layers_16_fc2_bias4, alloc2272) R.vm.kill_object(alloc2271) R.vm.kill_object(model_decoder_layers_16_fc2_weight4) R.vm.kill_object(model_decoder_layers_16_fc2_bias4) gv3044: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2273: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3044, R.dtype("float16")) cls.add5(alloc2269, alloc2272, alloc2273) R.vm.kill_object(alloc2269) R.vm.kill_object(alloc2272) model_decoder_layers_17_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[904] model_decoder_layers_17_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[905] gv3045: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2274: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3045, R.dtype("float16")) cls.layer_norm2(alloc2273, model_decoder_layers_17_self_attn_layer_norm_weight4, model_decoder_layers_17_self_attn_layer_norm_bias4, alloc2274) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias4) model_decoder_layers_17_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] model_decoder_layers_17_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[901] gv3046: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2275: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3046, R.dtype("float16")) _2274: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_q_proj_weight4, alloc2274, model_decoder_layers_17_self_attn_q_proj_bias4, alloc2275) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias4) gv3047: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1203: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2275, gv3047, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2275) model_decoder_layers_17_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] gv3048: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2276: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3048, R.dtype("float16")) _2275: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_17_self_attn_k_proj_weight4, alloc2274, alloc2276) R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight4) gv3049: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1204: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2276, gv3049, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2276) model_decoder_layers_17_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[898] model_decoder_layers_17_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[899] gv3050: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2277: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3050, R.dtype("float16")) _2276: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_v_proj_weight4, alloc2274, model_decoder_layers_17_self_attn_v_proj_bias4, alloc2277) R.vm.kill_object(alloc2274) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias4) gv3051: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1205: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2277, gv3051, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2277) gv3052: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2278: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3052, R.dtype("float16")) 
cls.concatenate1(reshape1203, reshape1204, reshape1205, alloc2278) R.vm.kill_object(reshape1203) R.vm.kill_object(reshape1204) R.vm.kill_object(reshape1205) gv3053: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1206: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2278, gv3053, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2278) gv3054: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2279: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3054, R.dtype("float16")) _2278: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape1206, alloc2279) R.vm.kill_object(reshape1206) gv3055: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1207: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2279, gv3055, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2279) gv3056: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1208: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1207, gv3056, sinfo_args=(R.Tensor((1, seq_len, 
1280), dtype="float16"),)) R.vm.kill_object(reshape1207) model_decoder_layers_17_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] model_decoder_layers_17_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[903] gv3057: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2280: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3057, R.dtype("float16")) _2279: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_out_proj_weight4, reshape1208, model_decoder_layers_17_self_attn_out_proj_bias4, alloc2280) R.vm.kill_object(reshape1208) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias4) gv3058: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2281: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3058, R.dtype("float16")) cls.add5(alloc2273, alloc2280, alloc2281) R.vm.kill_object(alloc2273) R.vm.kill_object(alloc2280) model_decoder_layers_17_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[913] model_decoder_layers_17_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[914] gv3059: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2282: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, 
R.prim_value(0), gv3059, R.dtype("float16")) cls.layer_norm2(alloc2281, model_decoder_layers_17_encoder_attn_layer_norm_weight4, model_decoder_layers_17_encoder_attn_layer_norm_bias4, alloc2282) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias4) model_decoder_layers_17_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] model_decoder_layers_17_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[910] gv3060: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2283: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3060, R.dtype("float16")) _2282: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_q_proj_weight4, alloc2282, model_decoder_layers_17_encoder_attn_q_proj_bias4, alloc2283) R.vm.kill_object(alloc2282) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias4) gv3061: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1209: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2283, gv3061, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2283) gv3062: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) 
reshape1210: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1209, gv3062, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1209) gv3063: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2284: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3063, R.dtype("float16")) _2283: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape1210, alloc2284) R.vm.kill_object(reshape1210) gv3064: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1211: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2284, gv3064, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2284) gv3065: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1212: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1211, gv3065, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1211) model_decoder_layers_17_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] model_decoder_layers_17_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[912] gv3066: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2285: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3066, R.dtype("float16")) _2284: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_out_proj_weight4, reshape1212, model_decoder_layers_17_encoder_attn_out_proj_bias4, alloc2285) R.vm.kill_object(reshape1212) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias4) gv3067: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2286: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3067, R.dtype("float16")) cls.add5(alloc2281, alloc2285, alloc2286) R.vm.kill_object(alloc2281) R.vm.kill_object(alloc2285) model_decoder_layers_17_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[919] model_decoder_layers_17_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[920] gv3068: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2287: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3068, R.dtype("float16")) cls.layer_norm2(alloc2286, model_decoder_layers_17_final_layer_norm_weight4, model_decoder_layers_17_final_layer_norm_bias4, alloc2287) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias4) model_decoder_layers_17_fc1_weight4: R.Tensor((5120, 1280), 
dtype="float16") = packed_params[915] model_decoder_layers_17_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[916] gv3069: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2288: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3069, R.dtype("float16")) _2287: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_17_fc1_weight4, alloc2287, model_decoder_layers_17_fc1_bias4, alloc2288) R.vm.kill_object(alloc2287) R.vm.kill_object(model_decoder_layers_17_fc1_weight4) R.vm.kill_object(model_decoder_layers_17_fc1_bias4) model_decoder_layers_17_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] model_decoder_layers_17_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[918] gv3070: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2289: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3070, R.dtype("float16")) _2288: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_17_fc2_weight4, alloc2288, model_decoder_layers_17_fc2_bias4, alloc2289) R.vm.kill_object(alloc2288) R.vm.kill_object(model_decoder_layers_17_fc2_weight4) R.vm.kill_object(model_decoder_layers_17_fc2_bias4) gv3071: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2290: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), 
gv3071, R.dtype("float16")) cls.add5(alloc2286, alloc2289, alloc2290) R.vm.kill_object(alloc2286) R.vm.kill_object(alloc2289) model_decoder_layers_18_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[928] model_decoder_layers_18_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[929] gv3072: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2291: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3072, R.dtype("float16")) cls.layer_norm2(alloc2290, model_decoder_layers_18_self_attn_layer_norm_weight4, model_decoder_layers_18_self_attn_layer_norm_bias4, alloc2291) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias4) model_decoder_layers_18_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] model_decoder_layers_18_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[925] gv3073: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2292: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3073, R.dtype("float16")) _2291: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_q_proj_weight4, alloc2291, model_decoder_layers_18_self_attn_q_proj_bias4, alloc2292) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias4) gv3074: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1213: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2292, gv3074, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2292) model_decoder_layers_18_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] gv3075: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2293: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3075, R.dtype("float16")) _2292: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_18_self_attn_k_proj_weight4, alloc2291, alloc2293) R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight4) gv3076: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1214: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2293, gv3076, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2293) model_decoder_layers_18_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] model_decoder_layers_18_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[923] gv3077: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2294: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3077, R.dtype("float16")) _2293: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_v_proj_weight4, alloc2291, model_decoder_layers_18_self_attn_v_proj_bias4, alloc2294) R.vm.kill_object(alloc2291) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias4) gv3078: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1215: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2294, gv3078, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2294) gv3079: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2295: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3079, R.dtype("float16")) cls.concatenate1(reshape1213, reshape1214, reshape1215, alloc2295) R.vm.kill_object(reshape1213) R.vm.kill_object(reshape1214) R.vm.kill_object(reshape1215) gv3080: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1216: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2295, gv3080, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2295) gv3081: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2296: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3081, R.dtype("float16")) _2295: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape1216, alloc2296) R.vm.kill_object(reshape1216) gv3082: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1217: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2296, gv3082, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2296) gv3083: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1218: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1217, gv3083, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1217) model_decoder_layers_18_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] model_decoder_layers_18_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[927] gv3084: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2297: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3084, R.dtype("float16")) _2296: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_out_proj_weight4, reshape1218, model_decoder_layers_18_self_attn_out_proj_bias4, alloc2297) R.vm.kill_object(reshape1218) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias4) gv3085: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2298: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3085, R.dtype("float16")) cls.add5(alloc2290, alloc2297, alloc2298) R.vm.kill_object(alloc2290) R.vm.kill_object(alloc2297) model_decoder_layers_18_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[937] model_decoder_layers_18_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[938] gv3086: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2299: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3086, R.dtype("float16")) cls.layer_norm2(alloc2298, model_decoder_layers_18_encoder_attn_layer_norm_weight4, model_decoder_layers_18_encoder_attn_layer_norm_bias4, alloc2299) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias4) model_decoder_layers_18_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] model_decoder_layers_18_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[934] gv3087: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2300: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3087, R.dtype("float16")) _2299: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_q_proj_weight4, alloc2299, model_decoder_layers_18_encoder_attn_q_proj_bias4, alloc2300) R.vm.kill_object(alloc2299) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias4) gv3088: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1219: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2300, gv3088, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2300) gv3089: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1220: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1219, gv3089, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1219) gv3090: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2301: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3090, R.dtype("float16")) _2300: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, 
R.prim_value(18), R.prim_value(T.float32(1)), reshape1220, alloc2301) R.vm.kill_object(reshape1220) gv3091: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1221: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2301, gv3091, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2301) gv3092: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1222: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1221, gv3092, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1221) model_decoder_layers_18_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] model_decoder_layers_18_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[936] gv3093: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2302: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3093, R.dtype("float16")) _2301: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_out_proj_weight4, reshape1222, model_decoder_layers_18_encoder_attn_out_proj_bias4, alloc2302) R.vm.kill_object(reshape1222) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias4) gv3094: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2303: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3094, R.dtype("float16")) cls.add5(alloc2298, alloc2302, alloc2303) R.vm.kill_object(alloc2298) R.vm.kill_object(alloc2302) model_decoder_layers_18_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[943] model_decoder_layers_18_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[944] gv3095: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2304: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3095, R.dtype("float16")) cls.layer_norm2(alloc2303, model_decoder_layers_18_final_layer_norm_weight4, model_decoder_layers_18_final_layer_norm_bias4, alloc2304) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias4) model_decoder_layers_18_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] model_decoder_layers_18_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[940] gv3096: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2305: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3096, R.dtype("float16")) _2304: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_18_fc1_weight4, alloc2304, model_decoder_layers_18_fc1_bias4, 
alloc2305) R.vm.kill_object(alloc2304) R.vm.kill_object(model_decoder_layers_18_fc1_weight4) R.vm.kill_object(model_decoder_layers_18_fc1_bias4) model_decoder_layers_18_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] model_decoder_layers_18_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[942] gv3097: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2306: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3097, R.dtype("float16")) _2305: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_18_fc2_weight4, alloc2305, model_decoder_layers_18_fc2_bias4, alloc2306) R.vm.kill_object(alloc2305) R.vm.kill_object(model_decoder_layers_18_fc2_weight4) R.vm.kill_object(model_decoder_layers_18_fc2_bias4) gv3098: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2307: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3098, R.dtype("float16")) cls.add5(alloc2303, alloc2306, alloc2307) R.vm.kill_object(alloc2303) R.vm.kill_object(alloc2306) model_decoder_layers_19_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[952] model_decoder_layers_19_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[953] gv3099: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2308: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), 
gv3099, R.dtype("float16")) cls.layer_norm2(alloc2307, model_decoder_layers_19_self_attn_layer_norm_weight4, model_decoder_layers_19_self_attn_layer_norm_bias4, alloc2308) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias4) model_decoder_layers_19_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] model_decoder_layers_19_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[949] gv3100: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2309: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3100, R.dtype("float16")) _2308: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_q_proj_weight4, alloc2308, model_decoder_layers_19_self_attn_q_proj_bias4, alloc2309) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias4) gv3101: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1223: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2309, gv3101, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2309) model_decoder_layers_19_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] gv3102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2310: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3102, R.dtype("float16")) _2309: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_19_self_attn_k_proj_weight4, alloc2308, alloc2310) R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight4) gv3103: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1224: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2310, gv3103, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2310) model_decoder_layers_19_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] model_decoder_layers_19_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[947] gv3104: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2311: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3104, R.dtype("float16")) _2310: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_v_proj_weight4, alloc2308, model_decoder_layers_19_self_attn_v_proj_bias4, alloc2311) R.vm.kill_object(alloc2308) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias4) gv3105: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1225: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2311, gv3105, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2311) gv3106: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2312: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3106, R.dtype("float16")) cls.concatenate1(reshape1223, reshape1224, reshape1225, alloc2312) R.vm.kill_object(reshape1223) R.vm.kill_object(reshape1224) R.vm.kill_object(reshape1225) gv3107: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1226: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2312, gv3107, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2312) gv3108: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2313: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3108, R.dtype("float16")) _2312: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape1226, alloc2313) R.vm.kill_object(reshape1226) gv3109: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1227: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2313, gv3109, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2313) gv3110: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1228: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1227, gv3110, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1227) model_decoder_layers_19_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] model_decoder_layers_19_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[951] gv3111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2314: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3111, R.dtype("float16")) _2313: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_out_proj_weight4, reshape1228, model_decoder_layers_19_self_attn_out_proj_bias4, alloc2314) R.vm.kill_object(reshape1228) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias4) gv3112: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2315: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, 
R.prim_value(0), gv3112, R.dtype("float16")) cls.add5(alloc2307, alloc2314, alloc2315) R.vm.kill_object(alloc2307) R.vm.kill_object(alloc2314) model_decoder_layers_19_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[961] model_decoder_layers_19_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[962] gv3113: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2316: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3113, R.dtype("float16")) cls.layer_norm2(alloc2315, model_decoder_layers_19_encoder_attn_layer_norm_weight4, model_decoder_layers_19_encoder_attn_layer_norm_bias4, alloc2316) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias4) model_decoder_layers_19_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] model_decoder_layers_19_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[958] gv3114: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2317: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3114, R.dtype("float16")) _2316: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_q_proj_weight4, alloc2316, model_decoder_layers_19_encoder_attn_q_proj_bias4, alloc2317) R.vm.kill_object(alloc2316) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias4) gv3115: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1229: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2317, gv3115, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2317) gv3116: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1230: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1229, gv3116, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1229) gv3117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2318: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3117, R.dtype("float16")) _2317: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape1230, alloc2318) R.vm.kill_object(reshape1230) gv3118: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1231: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2318, gv3118, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2318) gv3119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1232: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1231, gv3119, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1231) model_decoder_layers_19_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] model_decoder_layers_19_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[960] gv3120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2319: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3120, R.dtype("float16")) _2318: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_out_proj_weight4, reshape1232, model_decoder_layers_19_encoder_attn_out_proj_bias4, alloc2319) R.vm.kill_object(reshape1232) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias4) gv3121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2320: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3121, R.dtype("float16")) cls.add5(alloc2315, alloc2319, alloc2320) R.vm.kill_object(alloc2315) R.vm.kill_object(alloc2319) model_decoder_layers_19_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[967] model_decoder_layers_19_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[968] gv3122: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2321: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3122, R.dtype("float16")) cls.layer_norm2(alloc2320, model_decoder_layers_19_final_layer_norm_weight4, model_decoder_layers_19_final_layer_norm_bias4, alloc2321) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias4) model_decoder_layers_19_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] model_decoder_layers_19_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[964] gv3123: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2322: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3123, R.dtype("float16")) _2321: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_19_fc1_weight4, alloc2321, model_decoder_layers_19_fc1_bias4, alloc2322) R.vm.kill_object(alloc2321) R.vm.kill_object(model_decoder_layers_19_fc1_weight4) R.vm.kill_object(model_decoder_layers_19_fc1_bias4) model_decoder_layers_19_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] model_decoder_layers_19_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[966] gv3124: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2323: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3124, 
R.dtype("float16")) _2322: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_19_fc2_weight4, alloc2322, model_decoder_layers_19_fc2_bias4, alloc2323) R.vm.kill_object(alloc2322) R.vm.kill_object(model_decoder_layers_19_fc2_weight4) R.vm.kill_object(model_decoder_layers_19_fc2_bias4) gv3125: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2324: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3125, R.dtype("float16")) cls.add5(alloc2320, alloc2323, alloc2324) R.vm.kill_object(alloc2320) R.vm.kill_object(alloc2323) model_decoder_layers_20_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[976] model_decoder_layers_20_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[977] gv3126: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2325: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3126, R.dtype("float16")) cls.layer_norm2(alloc2324, model_decoder_layers_20_self_attn_layer_norm_weight4, model_decoder_layers_20_self_attn_layer_norm_bias4, alloc2325) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias4) model_decoder_layers_20_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] model_decoder_layers_20_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[973] gv3127: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2326: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3127, R.dtype("float16")) _2325: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_q_proj_weight4, alloc2325, model_decoder_layers_20_self_attn_q_proj_bias4, alloc2326) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias4) gv3128: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1233: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2326, gv3128, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2326) model_decoder_layers_20_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] gv3129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2327: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3129, R.dtype("float16")) _2326: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_20_self_attn_k_proj_weight4, alloc2325, alloc2327) R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight4) gv3130: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1234: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2327, gv3130, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2327) model_decoder_layers_20_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] model_decoder_layers_20_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[971] gv3131: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2328: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3131, R.dtype("float16")) _2327: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_v_proj_weight4, alloc2325, model_decoder_layers_20_self_attn_v_proj_bias4, alloc2328) R.vm.kill_object(alloc2325) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias4) gv3132: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1235: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2328, gv3132, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2328) gv3133: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2329: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3133, R.dtype("float16")) 
cls.concatenate1(reshape1233, reshape1234, reshape1235, alloc2329) R.vm.kill_object(reshape1233) R.vm.kill_object(reshape1234) R.vm.kill_object(reshape1235) gv3134: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1236: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2329, gv3134, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2329) gv3135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2330: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3135, R.dtype("float16")) _2329: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape1236, alloc2330) R.vm.kill_object(reshape1236) gv3136: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1237: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2330, gv3136, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2330) gv3137: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1238: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1237, gv3137, sinfo_args=(R.Tensor((1, seq_len, 
1280), dtype="float16"),)) R.vm.kill_object(reshape1237) model_decoder_layers_20_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] model_decoder_layers_20_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[975] gv3138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2331: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3138, R.dtype("float16")) _2330: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_out_proj_weight4, reshape1238, model_decoder_layers_20_self_attn_out_proj_bias4, alloc2331) R.vm.kill_object(reshape1238) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias4) gv3139: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2332: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3139, R.dtype("float16")) cls.add5(alloc2324, alloc2331, alloc2332) R.vm.kill_object(alloc2324) R.vm.kill_object(alloc2331) model_decoder_layers_20_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[985] model_decoder_layers_20_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[986] gv3140: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2333: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, 
R.prim_value(0), gv3140, R.dtype("float16")) cls.layer_norm2(alloc2332, model_decoder_layers_20_encoder_attn_layer_norm_weight4, model_decoder_layers_20_encoder_attn_layer_norm_bias4, alloc2333) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias4) model_decoder_layers_20_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] model_decoder_layers_20_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[982] gv3141: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2334: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3141, R.dtype("float16")) _2333: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_q_proj_weight4, alloc2333, model_decoder_layers_20_encoder_attn_q_proj_bias4, alloc2334) R.vm.kill_object(alloc2333) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias4) gv3142: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1239: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2334, gv3142, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2334) gv3143: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) 
reshape1240: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1239, gv3143, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1239) gv3144: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2335: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3144, R.dtype("float16")) _2334: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape1240, alloc2335) R.vm.kill_object(reshape1240) gv3145: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1241: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2335, gv3145, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2335) gv3146: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1242: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1241, gv3146, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1241) model_decoder_layers_20_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] model_decoder_layers_20_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[984] gv3147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2336: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3147, R.dtype("float16")) _2335: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_out_proj_weight4, reshape1242, model_decoder_layers_20_encoder_attn_out_proj_bias4, alloc2336) R.vm.kill_object(reshape1242) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias4) gv3148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2337: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3148, R.dtype("float16")) cls.add5(alloc2332, alloc2336, alloc2337) R.vm.kill_object(alloc2332) R.vm.kill_object(alloc2336) model_decoder_layers_20_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[991] model_decoder_layers_20_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[992] gv3149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2338: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3149, R.dtype("float16")) cls.layer_norm2(alloc2337, model_decoder_layers_20_final_layer_norm_weight4, model_decoder_layers_20_final_layer_norm_bias4, alloc2338) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias4) model_decoder_layers_20_fc1_weight4: R.Tensor((5120, 1280), 
dtype="float16") = packed_params[987] model_decoder_layers_20_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[988] gv3150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2339: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3150, R.dtype("float16")) _2338: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_20_fc1_weight4, alloc2338, model_decoder_layers_20_fc1_bias4, alloc2339) R.vm.kill_object(alloc2338) R.vm.kill_object(model_decoder_layers_20_fc1_weight4) R.vm.kill_object(model_decoder_layers_20_fc1_bias4) model_decoder_layers_20_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] model_decoder_layers_20_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[990] gv3151: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2340: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3151, R.dtype("float16")) _2339: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_20_fc2_weight4, alloc2339, model_decoder_layers_20_fc2_bias4, alloc2340) R.vm.kill_object(alloc2339) R.vm.kill_object(model_decoder_layers_20_fc2_weight4) R.vm.kill_object(model_decoder_layers_20_fc2_bias4) gv3152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2341: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), 
gv3152, R.dtype("float16")) cls.add5(alloc2337, alloc2340, alloc2341) R.vm.kill_object(alloc2337) R.vm.kill_object(alloc2340) model_decoder_layers_21_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1000] model_decoder_layers_21_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1001] gv3153: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2342: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3153, R.dtype("float16")) cls.layer_norm2(alloc2341, model_decoder_layers_21_self_attn_layer_norm_weight4, model_decoder_layers_21_self_attn_layer_norm_bias4, alloc2342) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias4) model_decoder_layers_21_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] model_decoder_layers_21_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[997] gv3154: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2343: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3154, R.dtype("float16")) _2342: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_q_proj_weight4, alloc2342, model_decoder_layers_21_self_attn_q_proj_bias4, alloc2343) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias4) gv3155: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1243: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2343, gv3155, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2343) model_decoder_layers_21_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] gv3156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2344: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3156, R.dtype("float16")) _2343: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_21_self_attn_k_proj_weight4, alloc2342, alloc2344) R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight4) gv3157: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1244: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2344, gv3157, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2344) model_decoder_layers_21_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] model_decoder_layers_21_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[995] gv3158: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2345: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3158, R.dtype("float16")) _2344: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_v_proj_weight4, alloc2342, model_decoder_layers_21_self_attn_v_proj_bias4, alloc2345) R.vm.kill_object(alloc2342) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias4) gv3159: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1245: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2345, gv3159, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2345) gv3160: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2346: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3160, R.dtype("float16")) cls.concatenate1(reshape1243, reshape1244, reshape1245, alloc2346) R.vm.kill_object(reshape1243) R.vm.kill_object(reshape1244) R.vm.kill_object(reshape1245) gv3161: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1246: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2346, gv3161, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2346) gv3162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2347: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3162, R.dtype("float16")) _2346: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape1246, alloc2347) R.vm.kill_object(reshape1246) gv3163: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1247: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2347, gv3163, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2347) gv3164: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1248: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1247, gv3164, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1247) model_decoder_layers_21_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] model_decoder_layers_21_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[999] gv3165: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2348: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3165, R.dtype("float16")) _2347: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_out_proj_weight4, reshape1248, model_decoder_layers_21_self_attn_out_proj_bias4, alloc2348) R.vm.kill_object(reshape1248) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias4) gv3166: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2349: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3166, R.dtype("float16")) cls.add5(alloc2341, alloc2348, alloc2349) R.vm.kill_object(alloc2341) R.vm.kill_object(alloc2348) model_decoder_layers_21_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1009] model_decoder_layers_21_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1010] gv3167: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2350: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3167, R.dtype("float16")) cls.layer_norm2(alloc2349, model_decoder_layers_21_encoder_attn_layer_norm_weight4, model_decoder_layers_21_encoder_attn_layer_norm_bias4, alloc2350) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias4) model_decoder_layers_21_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] model_decoder_layers_21_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1006] gv3168: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2351: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3168, R.dtype("float16")) _2350: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_q_proj_weight4, alloc2350, model_decoder_layers_21_encoder_attn_q_proj_bias4, alloc2351) R.vm.kill_object(alloc2350) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias4) gv3169: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1249: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2351, gv3169, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2351) gv3170: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1250: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1249, gv3170, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1249) gv3171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2352: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3171, R.dtype("float16")) _2351: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", 
paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape1250, alloc2352) R.vm.kill_object(reshape1250) gv3172: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1251: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2352, gv3172, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2352) gv3173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1252: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1251, gv3173, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1251) model_decoder_layers_21_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] model_decoder_layers_21_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1008] gv3174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2353: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3174, R.dtype("float16")) _2352: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_out_proj_weight4, reshape1252, model_decoder_layers_21_encoder_attn_out_proj_bias4, alloc2353) R.vm.kill_object(reshape1252) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias4) 
gv3175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2354: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3175, R.dtype("float16")) cls.add5(alloc2349, alloc2353, alloc2354) R.vm.kill_object(alloc2349) R.vm.kill_object(alloc2353) model_decoder_layers_21_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1015] model_decoder_layers_21_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1016] gv3176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2355: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3176, R.dtype("float16")) cls.layer_norm2(alloc2354, model_decoder_layers_21_final_layer_norm_weight4, model_decoder_layers_21_final_layer_norm_bias4, alloc2355) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias4) model_decoder_layers_21_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] model_decoder_layers_21_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1012] gv3177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2356: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3177, R.dtype("float16")) _2355: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_21_fc1_weight4, alloc2355, 
model_decoder_layers_21_fc1_bias4, alloc2356) R.vm.kill_object(alloc2355) R.vm.kill_object(model_decoder_layers_21_fc1_weight4) R.vm.kill_object(model_decoder_layers_21_fc1_bias4) model_decoder_layers_21_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] model_decoder_layers_21_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1014] gv3178: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2357: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3178, R.dtype("float16")) _2356: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_21_fc2_weight4, alloc2356, model_decoder_layers_21_fc2_bias4, alloc2357) R.vm.kill_object(alloc2356) R.vm.kill_object(model_decoder_layers_21_fc2_weight4) R.vm.kill_object(model_decoder_layers_21_fc2_bias4) gv3179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2358: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3179, R.dtype("float16")) cls.add5(alloc2354, alloc2357, alloc2358) R.vm.kill_object(alloc2354) R.vm.kill_object(alloc2357) model_decoder_layers_22_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1024] model_decoder_layers_22_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1025] gv3180: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2359: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage41, R.prim_value(0), gv3180, R.dtype("float16")) cls.layer_norm2(alloc2358, model_decoder_layers_22_self_attn_layer_norm_weight4, model_decoder_layers_22_self_attn_layer_norm_bias4, alloc2359) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias4) model_decoder_layers_22_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] model_decoder_layers_22_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1021] gv3181: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2360: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3181, R.dtype("float16")) _2359: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_q_proj_weight4, alloc2359, model_decoder_layers_22_self_attn_q_proj_bias4, alloc2360) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias4) gv3182: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1253: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2360, gv3182, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2360) model_decoder_layers_22_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] gv3183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2361: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3183, R.dtype("float16")) _2360: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_22_self_attn_k_proj_weight4, alloc2359, alloc2361) R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight4) gv3184: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1254: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2361, gv3184, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2361) model_decoder_layers_22_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] model_decoder_layers_22_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1019] gv3185: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2362: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3185, R.dtype("float16")) _2361: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_v_proj_weight4, alloc2359, model_decoder_layers_22_self_attn_v_proj_bias4, alloc2362) R.vm.kill_object(alloc2359) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias4) gv3186: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1255: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2362, gv3186, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2362) gv3187: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2363: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3187, R.dtype("float16")) cls.concatenate1(reshape1253, reshape1254, reshape1255, alloc2363) R.vm.kill_object(reshape1253) R.vm.kill_object(reshape1254) R.vm.kill_object(reshape1255) gv3188: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1256: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2363, gv3188, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2363) gv3189: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2364: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3189, R.dtype("float16")) _2363: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape1256, alloc2364) R.vm.kill_object(reshape1256) gv3190: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1257: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2364, gv3190, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2364) gv3191: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1258: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1257, gv3191, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1257) model_decoder_layers_22_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] model_decoder_layers_22_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1023] gv3192: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2365: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3192, R.dtype("float16")) _2364: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_out_proj_weight4, reshape1258, model_decoder_layers_22_self_attn_out_proj_bias4, alloc2365) R.vm.kill_object(reshape1258) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias4) gv3193: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2366: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage41, R.prim_value(0), gv3193, R.dtype("float16")) cls.add5(alloc2358, alloc2365, alloc2366) R.vm.kill_object(alloc2358) R.vm.kill_object(alloc2365) model_decoder_layers_22_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1033] model_decoder_layers_22_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1034] gv3194: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2367: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3194, R.dtype("float16")) cls.layer_norm2(alloc2366, model_decoder_layers_22_encoder_attn_layer_norm_weight4, model_decoder_layers_22_encoder_attn_layer_norm_bias4, alloc2367) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias4) model_decoder_layers_22_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] model_decoder_layers_22_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1030] gv3195: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2368: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3195, R.dtype("float16")) _2367: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_q_proj_weight4, alloc2367, model_decoder_layers_22_encoder_attn_q_proj_bias4, alloc2368) R.vm.kill_object(alloc2367) R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight4) 
R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias4) gv3196: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1259: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2368, gv3196, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2368) gv3197: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1260: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1259, gv3197, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1259) gv3198: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2369: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3198, R.dtype("float16")) _2368: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape1260, alloc2369) R.vm.kill_object(reshape1260) gv3199: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1261: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2369, gv3199, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2369) 
gv3200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1262: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1261, gv3200, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1261) model_decoder_layers_22_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] model_decoder_layers_22_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1032] gv3201: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2370: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3201, R.dtype("float16")) _2369: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_out_proj_weight4, reshape1262, model_decoder_layers_22_encoder_attn_out_proj_bias4, alloc2370) R.vm.kill_object(reshape1262) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias4) gv3202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2371: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3202, R.dtype("float16")) cls.add5(alloc2366, alloc2370, alloc2371) R.vm.kill_object(alloc2366) R.vm.kill_object(alloc2370) model_decoder_layers_22_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1039] 
model_decoder_layers_22_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1040] gv3203: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2372: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3203, R.dtype("float16")) cls.layer_norm2(alloc2371, model_decoder_layers_22_final_layer_norm_weight4, model_decoder_layers_22_final_layer_norm_bias4, alloc2372) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias4) model_decoder_layers_22_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] model_decoder_layers_22_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1036] gv3204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2373: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3204, R.dtype("float16")) _2372: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_22_fc1_weight4, alloc2372, model_decoder_layers_22_fc1_bias4, alloc2373) R.vm.kill_object(alloc2372) R.vm.kill_object(model_decoder_layers_22_fc1_weight4) R.vm.kill_object(model_decoder_layers_22_fc1_bias4) model_decoder_layers_22_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] model_decoder_layers_22_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1038] gv3205: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2374: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3205, R.dtype("float16")) _2373: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_22_fc2_weight4, alloc2373, model_decoder_layers_22_fc2_bias4, alloc2374) R.vm.kill_object(alloc2373) R.vm.kill_object(model_decoder_layers_22_fc2_weight4) R.vm.kill_object(model_decoder_layers_22_fc2_bias4) gv3206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2375: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3206, R.dtype("float16")) cls.add5(alloc2371, alloc2374, alloc2375) R.vm.kill_object(alloc2371) R.vm.kill_object(alloc2374) model_decoder_layers_23_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1048] model_decoder_layers_23_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1049] gv3207: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2376: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3207, R.dtype("float16")) cls.layer_norm2(alloc2375, model_decoder_layers_23_self_attn_layer_norm_weight4, model_decoder_layers_23_self_attn_layer_norm_bias4, alloc2376) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias4) model_decoder_layers_23_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] model_decoder_layers_23_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1045] gv3208: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2377: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3208, R.dtype("float16")) _2376: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_q_proj_weight4, alloc2376, model_decoder_layers_23_self_attn_q_proj_bias4, alloc2377) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias4) gv3209: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1263: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2377, gv3209, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2377) model_decoder_layers_23_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] gv3210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2378: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3210, R.dtype("float16")) _2377: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_23_self_attn_k_proj_weight4, alloc2376, alloc2378) R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight4) gv3211: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1264: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2378, gv3211, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2378) model_decoder_layers_23_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] model_decoder_layers_23_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1043] gv3212: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2379: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3212, R.dtype("float16")) _2378: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_v_proj_weight4, alloc2376, model_decoder_layers_23_self_attn_v_proj_bias4, alloc2379) R.vm.kill_object(alloc2376) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias4) gv3213: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1265: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2379, gv3213, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2379) gv3214: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) alloc2380: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3214, R.dtype("float16")) cls.concatenate1(reshape1263, reshape1264, reshape1265, alloc2380) R.vm.kill_object(reshape1263) R.vm.kill_object(reshape1264) R.vm.kill_object(reshape1265) gv3215: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1266: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2380, gv3215, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2380) gv3216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2381: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3216, R.dtype("float16")) _2380: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape1266, alloc2381) R.vm.kill_object(reshape1266) gv3217: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1267: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2381, gv3217, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2381) gv3218: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
reshape1268: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1267, gv3218, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1267) model_decoder_layers_23_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] model_decoder_layers_23_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1047] gv3219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2382: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3219, R.dtype("float16")) _2381: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_out_proj_weight4, reshape1268, model_decoder_layers_23_self_attn_out_proj_bias4, alloc2382) R.vm.kill_object(reshape1268) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias4) gv3220: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2383: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3220, R.dtype("float16")) cls.add5(alloc2375, alloc2382, alloc2383) R.vm.kill_object(alloc2375) R.vm.kill_object(alloc2382) model_decoder_layers_23_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1057] model_decoder_layers_23_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1058] gv3221: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2384: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3221, R.dtype("float16")) cls.layer_norm2(alloc2383, model_decoder_layers_23_encoder_attn_layer_norm_weight4, model_decoder_layers_23_encoder_attn_layer_norm_bias4, alloc2384) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias4) model_decoder_layers_23_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] model_decoder_layers_23_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1054] gv3222: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2385: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3222, R.dtype("float16")) _2384: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_q_proj_weight4, alloc2384, model_decoder_layers_23_encoder_attn_q_proj_bias4, alloc2385) R.vm.kill_object(alloc2384) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias4) gv3223: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1269: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2385, gv3223, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2385) gv3224: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1270: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1269, gv3224, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1269) gv3225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2386: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3225, R.dtype("float16")) _2385: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape1270, alloc2386) R.vm.kill_object(reshape1270) gv3226: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1271: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2386, gv3226, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2386) gv3227: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1272: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1271, gv3227, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1271) model_decoder_layers_23_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] model_decoder_layers_23_encoder_attn_out_proj_bias4: 
R.Tensor((1280,), dtype="float16") = packed_params[1056] gv3228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2387: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3228, R.dtype("float16")) _2386: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_out_proj_weight4, reshape1272, model_decoder_layers_23_encoder_attn_out_proj_bias4, alloc2387) R.vm.kill_object(reshape1272) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias4) gv3229: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2388: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3229, R.dtype("float16")) cls.add5(alloc2383, alloc2387, alloc2388) R.vm.kill_object(alloc2383) R.vm.kill_object(alloc2387) model_decoder_layers_23_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1063] model_decoder_layers_23_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1064] gv3230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2389: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3230, R.dtype("float16")) cls.layer_norm2(alloc2388, model_decoder_layers_23_final_layer_norm_weight4, model_decoder_layers_23_final_layer_norm_bias4, alloc2389) 
R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias4) model_decoder_layers_23_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] model_decoder_layers_23_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1060] gv3231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2390: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3231, R.dtype("float16")) _2389: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_23_fc1_weight4, alloc2389, model_decoder_layers_23_fc1_bias4, alloc2390) R.vm.kill_object(alloc2389) R.vm.kill_object(model_decoder_layers_23_fc1_weight4) R.vm.kill_object(model_decoder_layers_23_fc1_bias4) model_decoder_layers_23_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] model_decoder_layers_23_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1062] gv3232: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2391: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3232, R.dtype("float16")) _2390: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_23_fc2_weight4, alloc2390, model_decoder_layers_23_fc2_bias4, alloc2391) R.vm.kill_object(alloc2390) R.vm.kill_object(model_decoder_layers_23_fc2_weight4) R.vm.kill_object(model_decoder_layers_23_fc2_bias4) gv3233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2392: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3233, R.dtype("float16")) cls.add5(alloc2388, alloc2391, alloc2392) R.vm.kill_object(alloc2388) R.vm.kill_object(alloc2391) model_decoder_layers_24_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1072] model_decoder_layers_24_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1073] gv3234: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2393: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3234, R.dtype("float16")) cls.layer_norm2(alloc2392, model_decoder_layers_24_self_attn_layer_norm_weight4, model_decoder_layers_24_self_attn_layer_norm_bias4, alloc2393) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias4) model_decoder_layers_24_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] model_decoder_layers_24_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1069] gv3235: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2394: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3235, R.dtype("float16")) _2393: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_q_proj_weight4, alloc2393, model_decoder_layers_24_self_attn_q_proj_bias4, alloc2394) 
R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias4) gv3236: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1273: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2394, gv3236, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2394) model_decoder_layers_24_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] gv3237: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2395: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3237, R.dtype("float16")) _2394: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_24_self_attn_k_proj_weight4, alloc2393, alloc2395) R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight4) gv3238: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1274: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2395, gv3238, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2395) model_decoder_layers_24_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] model_decoder_layers_24_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1067] gv3239: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2396: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3239, R.dtype("float16")) _2395: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_v_proj_weight4, alloc2393, model_decoder_layers_24_self_attn_v_proj_bias4, alloc2396) R.vm.kill_object(alloc2393) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias4) gv3240: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1275: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2396, gv3240, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2396) gv3241: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2397: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3241, R.dtype("float16")) cls.concatenate1(reshape1273, reshape1274, reshape1275, alloc2397) R.vm.kill_object(reshape1273) R.vm.kill_object(reshape1274) R.vm.kill_object(reshape1275) gv3242: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1276: R.Tensor((seq_len, 60, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc2397, gv3242, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2397) gv3243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2398: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3243, R.dtype("float16")) _2397: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape1276, alloc2398) R.vm.kill_object(reshape1276) gv3244: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1277: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2398, gv3244, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2398) gv3245: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1278: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1277, gv3245, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1277) model_decoder_layers_24_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] model_decoder_layers_24_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1071] gv3246: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2399: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3246, R.dtype("float16")) _2398: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_out_proj_weight4, reshape1278, model_decoder_layers_24_self_attn_out_proj_bias4, alloc2399) R.vm.kill_object(reshape1278) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias4) gv3247: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2400: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3247, R.dtype("float16")) cls.add5(alloc2392, alloc2399, alloc2400) R.vm.kill_object(alloc2392) R.vm.kill_object(alloc2399) model_decoder_layers_24_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1081] model_decoder_layers_24_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1082] gv3248: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2401: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3248, R.dtype("float16")) cls.layer_norm2(alloc2400, model_decoder_layers_24_encoder_attn_layer_norm_weight4, model_decoder_layers_24_encoder_attn_layer_norm_bias4, alloc2401) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias4) model_decoder_layers_24_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[1077] model_decoder_layers_24_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1078] gv3249: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2402: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3249, R.dtype("float16")) _2401: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_q_proj_weight4, alloc2401, model_decoder_layers_24_encoder_attn_q_proj_bias4, alloc2402) R.vm.kill_object(alloc2401) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias4) gv3250: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1279: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2402, gv3250, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2402) gv3251: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1280: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1279, gv3251, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1279) gv3252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) 
alloc2403: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3252, R.dtype("float16")) _2402: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape1280, alloc2403) R.vm.kill_object(reshape1280) gv3253: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1281: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2403, gv3253, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2403) gv3254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1282: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1281, gv3254, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1281) model_decoder_layers_24_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] model_decoder_layers_24_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1080] gv3255: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2404: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3255, R.dtype("float16")) _2403: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_out_proj_weight4, reshape1282, 
model_decoder_layers_24_encoder_attn_out_proj_bias4, alloc2404) R.vm.kill_object(reshape1282) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias4) gv3256: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2405: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3256, R.dtype("float16")) cls.add5(alloc2400, alloc2404, alloc2405) R.vm.kill_object(alloc2400) R.vm.kill_object(alloc2404) model_decoder_layers_24_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1087] model_decoder_layers_24_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1088] gv3257: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2406: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3257, R.dtype("float16")) cls.layer_norm2(alloc2405, model_decoder_layers_24_final_layer_norm_weight4, model_decoder_layers_24_final_layer_norm_bias4, alloc2406) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias4) model_decoder_layers_24_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] model_decoder_layers_24_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1084] gv3258: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2407: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage37, R.prim_value(0), gv3258, R.dtype("float16")) _2406: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_24_fc1_weight4, alloc2406, model_decoder_layers_24_fc1_bias4, alloc2407) R.vm.kill_object(alloc2406) R.vm.kill_object(model_decoder_layers_24_fc1_weight4) R.vm.kill_object(model_decoder_layers_24_fc1_bias4) model_decoder_layers_24_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] model_decoder_layers_24_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1086] gv3259: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2408: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3259, R.dtype("float16")) _2407: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_24_fc2_weight4, alloc2407, model_decoder_layers_24_fc2_bias4, alloc2408) R.vm.kill_object(alloc2407) R.vm.kill_object(model_decoder_layers_24_fc2_weight4) R.vm.kill_object(model_decoder_layers_24_fc2_bias4) gv3260: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2409: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3260, R.dtype("float16")) cls.add5(alloc2405, alloc2408, alloc2409) R.vm.kill_object(alloc2405) R.vm.kill_object(alloc2408) model_decoder_layers_25_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1096] model_decoder_layers_25_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1097] gv3261: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2410: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3261, R.dtype("float16")) cls.layer_norm2(alloc2409, model_decoder_layers_25_self_attn_layer_norm_weight4, model_decoder_layers_25_self_attn_layer_norm_bias4, alloc2410) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias4) model_decoder_layers_25_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] model_decoder_layers_25_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1093] gv3262: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2411: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3262, R.dtype("float16")) _2410: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_q_proj_weight4, alloc2410, model_decoder_layers_25_self_attn_q_proj_bias4, alloc2411) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias4) gv3263: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1283: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2411, gv3263, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2411) 
model_decoder_layers_25_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] gv3264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2412: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3264, R.dtype("float16")) _2411: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_25_self_attn_k_proj_weight4, alloc2410, alloc2412) R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight4) gv3265: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1284: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2412, gv3265, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2412) model_decoder_layers_25_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] model_decoder_layers_25_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1091] gv3266: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2413: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3266, R.dtype("float16")) _2412: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_v_proj_weight4, alloc2410, model_decoder_layers_25_self_attn_v_proj_bias4, alloc2413) R.vm.kill_object(alloc2410) 
R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias4) gv3267: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1285: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2413, gv3267, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2413) gv3268: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2414: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3268, R.dtype("float16")) cls.concatenate1(reshape1283, reshape1284, reshape1285, alloc2414) R.vm.kill_object(reshape1283) R.vm.kill_object(reshape1284) R.vm.kill_object(reshape1285) gv3269: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1286: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2414, gv3269, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2414) gv3270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2415: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3270, R.dtype("float16")) _2414: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape1286, alloc2415) R.vm.kill_object(reshape1286) gv3271: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1287: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2415, gv3271, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2415) gv3272: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1288: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1287, gv3272, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1287) model_decoder_layers_25_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] model_decoder_layers_25_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1095] gv3273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2416: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3273, R.dtype("float16")) _2415: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_out_proj_weight4, reshape1288, model_decoder_layers_25_self_attn_out_proj_bias4, alloc2416) R.vm.kill_object(reshape1288) R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight4) 
R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias4) gv3274: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2417: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3274, R.dtype("float16")) cls.add5(alloc2409, alloc2416, alloc2417) R.vm.kill_object(alloc2409) R.vm.kill_object(alloc2416) model_decoder_layers_25_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1105] model_decoder_layers_25_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1106] gv3275: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2418: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3275, R.dtype("float16")) cls.layer_norm2(alloc2417, model_decoder_layers_25_encoder_attn_layer_norm_weight4, model_decoder_layers_25_encoder_attn_layer_norm_bias4, alloc2418) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias4) model_decoder_layers_25_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] model_decoder_layers_25_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1102] gv3276: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2419: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3276, R.dtype("float16")) _2418: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_q_proj_weight4, alloc2418, model_decoder_layers_25_encoder_attn_q_proj_bias4, alloc2419) R.vm.kill_object(alloc2418) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias4) gv3277: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1289: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2419, gv3277, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2419) gv3278: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1290: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1289, gv3278, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1289) gv3279: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2420: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3279, R.dtype("float16")) _2419: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape1290, alloc2420) R.vm.kill_object(reshape1290) gv3280: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1291: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2420, gv3280, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2420) gv3281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1292: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1291, gv3281, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1291) model_decoder_layers_25_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] model_decoder_layers_25_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1104] gv3282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2421: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3282, R.dtype("float16")) _2420: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_out_proj_weight4, reshape1292, model_decoder_layers_25_encoder_attn_out_proj_bias4, alloc2421) R.vm.kill_object(reshape1292) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias4) gv3283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2422: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage39, R.prim_value(0), gv3283, R.dtype("float16")) cls.add5(alloc2417, alloc2421, alloc2422) R.vm.kill_object(alloc2417) R.vm.kill_object(alloc2421) model_decoder_layers_25_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1111] model_decoder_layers_25_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1112] gv3284: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2423: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3284, R.dtype("float16")) cls.layer_norm2(alloc2422, model_decoder_layers_25_final_layer_norm_weight4, model_decoder_layers_25_final_layer_norm_bias4, alloc2423) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias4) model_decoder_layers_25_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] model_decoder_layers_25_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1108] gv3285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2424: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3285, R.dtype("float16")) _2423: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_25_fc1_weight4, alloc2423, model_decoder_layers_25_fc1_bias4, alloc2424) R.vm.kill_object(alloc2423) R.vm.kill_object(model_decoder_layers_25_fc1_weight4) R.vm.kill_object(model_decoder_layers_25_fc1_bias4) model_decoder_layers_25_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] 
model_decoder_layers_25_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1110] gv3286: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2425: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3286, R.dtype("float16")) _2424: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_25_fc2_weight4, alloc2424, model_decoder_layers_25_fc2_bias4, alloc2425) R.vm.kill_object(alloc2424) R.vm.kill_object(model_decoder_layers_25_fc2_weight4) R.vm.kill_object(model_decoder_layers_25_fc2_bias4) gv3287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2426: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3287, R.dtype("float16")) cls.add5(alloc2422, alloc2425, alloc2426) R.vm.kill_object(alloc2422) R.vm.kill_object(alloc2425) model_decoder_layers_26_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1120] model_decoder_layers_26_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1121] gv3288: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2427: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3288, R.dtype("float16")) cls.layer_norm2(alloc2426, model_decoder_layers_26_self_attn_layer_norm_weight4, model_decoder_layers_26_self_attn_layer_norm_bias4, alloc2427) R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight4) 
R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias4) model_decoder_layers_26_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] model_decoder_layers_26_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1117] gv3289: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2428: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3289, R.dtype("float16")) _2427: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_q_proj_weight4, alloc2427, model_decoder_layers_26_self_attn_q_proj_bias4, alloc2428) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias4) gv3290: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1293: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2428, gv3290, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2428) model_decoder_layers_26_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] gv3291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2429: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3291, R.dtype("float16")) _2428: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", 
model_decoder_layers_26_self_attn_k_proj_weight4, alloc2427, alloc2429) R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight4) gv3292: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1294: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2429, gv3292, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2429) model_decoder_layers_26_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] model_decoder_layers_26_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1115] gv3293: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2430: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3293, R.dtype("float16")) _2429: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_v_proj_weight4, alloc2427, model_decoder_layers_26_self_attn_v_proj_bias4, alloc2430) R.vm.kill_object(alloc2427) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias4) gv3294: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1295: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2430, gv3294, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) 
R.vm.kill_object(alloc2430) gv3295: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2431: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3295, R.dtype("float16")) cls.concatenate1(reshape1293, reshape1294, reshape1295, alloc2431) R.vm.kill_object(reshape1293) R.vm.kill_object(reshape1294) R.vm.kill_object(reshape1295) gv3296: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1296: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2431, gv3296, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2431) gv3297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2432: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3297, R.dtype("float16")) _2431: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape1296, alloc2432) R.vm.kill_object(reshape1296) gv3298: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1297: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2432, gv3298, sinfo_args=(R.Tensor((1, seq_len, 20, 64), 
dtype="float16"),)) R.vm.kill_object(alloc2432) gv3299: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1298: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1297, gv3299, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1297) model_decoder_layers_26_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] model_decoder_layers_26_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1119] gv3300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2433: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3300, R.dtype("float16")) _2432: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_out_proj_weight4, reshape1298, model_decoder_layers_26_self_attn_out_proj_bias4, alloc2433) R.vm.kill_object(reshape1298) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias4) gv3301: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2434: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3301, R.dtype("float16")) cls.add5(alloc2426, alloc2433, alloc2434) R.vm.kill_object(alloc2426) R.vm.kill_object(alloc2433) model_decoder_layers_26_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = 
packed_params[1129] model_decoder_layers_26_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1130] gv3302: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2435: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3302, R.dtype("float16")) cls.layer_norm2(alloc2434, model_decoder_layers_26_encoder_attn_layer_norm_weight4, model_decoder_layers_26_encoder_attn_layer_norm_bias4, alloc2435) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias4) model_decoder_layers_26_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] model_decoder_layers_26_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1126] gv3303: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2436: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3303, R.dtype("float16")) _2435: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_q_proj_weight4, alloc2435, model_decoder_layers_26_encoder_attn_q_proj_bias4, alloc2436) R.vm.kill_object(alloc2435) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias4) gv3304: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) 
reshape1299: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2436, gv3304, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2436) gv3305: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1300: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1299, gv3305, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1299) gv3306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2437: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3306, R.dtype("float16")) _2436: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape1300, alloc2437) R.vm.kill_object(reshape1300) gv3307: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1301: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2437, gv3307, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2437) gv3308: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1302: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape1301, gv3308, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1301) model_decoder_layers_26_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] model_decoder_layers_26_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1128] gv3309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2438: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3309, R.dtype("float16")) _2437: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_out_proj_weight4, reshape1302, model_decoder_layers_26_encoder_attn_out_proj_bias4, alloc2438) R.vm.kill_object(reshape1302) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias4) gv3310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2439: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3310, R.dtype("float16")) cls.add5(alloc2434, alloc2438, alloc2439) R.vm.kill_object(alloc2434) R.vm.kill_object(alloc2438) model_decoder_layers_26_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1135] model_decoder_layers_26_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1136] gv3311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2440: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3311, R.dtype("float16")) cls.layer_norm2(alloc2439, model_decoder_layers_26_final_layer_norm_weight4, model_decoder_layers_26_final_layer_norm_bias4, alloc2440) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias4) model_decoder_layers_26_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] model_decoder_layers_26_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1132] gv3312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2441: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3312, R.dtype("float16")) _2440: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_26_fc1_weight4, alloc2440, model_decoder_layers_26_fc1_bias4, alloc2441) R.vm.kill_object(alloc2440) R.vm.kill_object(model_decoder_layers_26_fc1_weight4) R.vm.kill_object(model_decoder_layers_26_fc1_bias4) model_decoder_layers_26_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] model_decoder_layers_26_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1134] gv3313: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2442: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3313, R.dtype("float16")) _2441: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_26_fc2_weight4, alloc2441, model_decoder_layers_26_fc2_bias4, alloc2442) 
R.vm.kill_object(alloc2441) R.vm.kill_object(model_decoder_layers_26_fc2_weight4) R.vm.kill_object(model_decoder_layers_26_fc2_bias4) gv3314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2443: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3314, R.dtype("float16")) cls.add5(alloc2439, alloc2442, alloc2443) R.vm.kill_object(alloc2439) R.vm.kill_object(alloc2442) model_decoder_layers_27_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1144] model_decoder_layers_27_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1145] gv3315: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2444: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3315, R.dtype("float16")) cls.layer_norm2(alloc2443, model_decoder_layers_27_self_attn_layer_norm_weight4, model_decoder_layers_27_self_attn_layer_norm_bias4, alloc2444) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias4) model_decoder_layers_27_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] model_decoder_layers_27_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1141] gv3316: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2445: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3316, 
R.dtype("float16")) _2444: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_q_proj_weight4, alloc2444, model_decoder_layers_27_self_attn_q_proj_bias4, alloc2445) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias4) gv3317: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1303: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2445, gv3317, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2445) model_decoder_layers_27_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] gv3318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2446: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3318, R.dtype("float16")) _2445: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_27_self_attn_k_proj_weight4, alloc2444, alloc2446) R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight4) gv3319: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1304: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2446, gv3319, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2446) 
model_decoder_layers_27_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] model_decoder_layers_27_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1139] gv3320: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2447: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3320, R.dtype("float16")) _2446: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_v_proj_weight4, alloc2444, model_decoder_layers_27_self_attn_v_proj_bias4, alloc2447) R.vm.kill_object(alloc2444) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias4) gv3321: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1305: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2447, gv3321, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2447) gv3322: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2448: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3322, R.dtype("float16")) cls.concatenate1(reshape1303, reshape1304, reshape1305, alloc2448) R.vm.kill_object(reshape1303) R.vm.kill_object(reshape1304) R.vm.kill_object(reshape1305) gv3323: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1306: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2448, gv3323, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2448) gv3324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2449: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3324, R.dtype("float16")) _2448: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape1306, alloc2449) R.vm.kill_object(reshape1306) gv3325: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1307: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2449, gv3325, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2449) gv3326: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1308: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1307, gv3326, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1307) model_decoder_layers_27_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] 
model_decoder_layers_27_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1143] gv3327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2450: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3327, R.dtype("float16")) _2449: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_out_proj_weight4, reshape1308, model_decoder_layers_27_self_attn_out_proj_bias4, alloc2450) R.vm.kill_object(reshape1308) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias4) gv3328: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2451: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3328, R.dtype("float16")) cls.add5(alloc2443, alloc2450, alloc2451) R.vm.kill_object(alloc2443) R.vm.kill_object(alloc2450) model_decoder_layers_27_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1153] model_decoder_layers_27_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1154] gv3329: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2452: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3329, R.dtype("float16")) cls.layer_norm2(alloc2451, model_decoder_layers_27_encoder_attn_layer_norm_weight4, 
model_decoder_layers_27_encoder_attn_layer_norm_bias4, alloc2452) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias4) model_decoder_layers_27_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] model_decoder_layers_27_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1150] gv3330: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2453: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3330, R.dtype("float16")) _2452: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_q_proj_weight4, alloc2452, model_decoder_layers_27_encoder_attn_q_proj_bias4, alloc2453) R.vm.kill_object(alloc2452) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias4) gv3331: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1309: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2453, gv3331, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2453) gv3332: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1310: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1309, gv3332, 
sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1309) gv3333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2454: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3333, R.dtype("float16")) _2453: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape1310, alloc2454) R.vm.kill_object(reshape1310) gv3334: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1311: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2454, gv3334, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2454) gv3335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1312: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1311, gv3335, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1311) model_decoder_layers_27_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] model_decoder_layers_27_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1152] gv3336: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2455: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3336, R.dtype("float16")) _2454: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_out_proj_weight4, reshape1312, model_decoder_layers_27_encoder_attn_out_proj_bias4, alloc2455) R.vm.kill_object(reshape1312) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias4) gv3337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2456: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3337, R.dtype("float16")) cls.add5(alloc2451, alloc2455, alloc2456) R.vm.kill_object(alloc2451) R.vm.kill_object(alloc2455) model_decoder_layers_27_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1159] model_decoder_layers_27_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1160] gv3338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2457: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3338, R.dtype("float16")) cls.layer_norm2(alloc2456, model_decoder_layers_27_final_layer_norm_weight4, model_decoder_layers_27_final_layer_norm_bias4, alloc2457) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias4) model_decoder_layers_27_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1155] model_decoder_layers_27_fc1_bias4: R.Tensor((5120,), dtype="float16") = 
packed_params[1156] gv3339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2458: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3339, R.dtype("float16")) _2457: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_27_fc1_weight4, alloc2457, model_decoder_layers_27_fc1_bias4, alloc2458) R.vm.kill_object(alloc2457) R.vm.kill_object(model_decoder_layers_27_fc1_weight4) R.vm.kill_object(model_decoder_layers_27_fc1_bias4) model_decoder_layers_27_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] model_decoder_layers_27_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1158] gv3340: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2459: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3340, R.dtype("float16")) _2458: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_27_fc2_weight4, alloc2458, model_decoder_layers_27_fc2_bias4, alloc2459) R.vm.kill_object(alloc2458) R.vm.kill_object(model_decoder_layers_27_fc2_weight4) R.vm.kill_object(model_decoder_layers_27_fc2_bias4) gv3341: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2460: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3341, R.dtype("float16")) cls.add5(alloc2456, alloc2459, alloc2460) R.vm.kill_object(alloc2456) 
R.vm.kill_object(alloc2459) model_decoder_layers_28_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1168] model_decoder_layers_28_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1169] gv3342: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2461: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3342, R.dtype("float16")) cls.layer_norm2(alloc2460, model_decoder_layers_28_self_attn_layer_norm_weight4, model_decoder_layers_28_self_attn_layer_norm_bias4, alloc2461) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias4) model_decoder_layers_28_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] model_decoder_layers_28_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1165] gv3343: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2462: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3343, R.dtype("float16")) _2461: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_q_proj_weight4, alloc2461, model_decoder_layers_28_self_attn_q_proj_bias4, alloc2462) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias4) gv3344: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1313: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2462, gv3344, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2462) model_decoder_layers_28_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] gv3345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2463: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3345, R.dtype("float16")) _2462: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_28_self_attn_k_proj_weight4, alloc2461, alloc2463) R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight4) gv3346: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1314: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2463, gv3346, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2463) model_decoder_layers_28_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] model_decoder_layers_28_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1163] gv3347: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2464: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3347, R.dtype("float16")) 
_2463: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_v_proj_weight4, alloc2461, model_decoder_layers_28_self_attn_v_proj_bias4, alloc2464) R.vm.kill_object(alloc2461) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias4) gv3348: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1315: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2464, gv3348, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2464) gv3349: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2465: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3349, R.dtype("float16")) cls.concatenate1(reshape1313, reshape1314, reshape1315, alloc2465) R.vm.kill_object(reshape1313) R.vm.kill_object(reshape1314) R.vm.kill_object(reshape1315) gv3350: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1316: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2465, gv3350, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2465) gv3351: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2466: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3351, R.dtype("float16")) _2465: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape1316, alloc2466) R.vm.kill_object(reshape1316) gv3352: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1317: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2466, gv3352, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2466) gv3353: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1318: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1317, gv3353, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1317) model_decoder_layers_28_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] model_decoder_layers_28_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1167] gv3354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2467: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3354, R.dtype("float16")) _2466: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_28_self_attn_out_proj_weight4, reshape1318, model_decoder_layers_28_self_attn_out_proj_bias4, alloc2467) R.vm.kill_object(reshape1318) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias4) gv3355: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2468: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3355, R.dtype("float16")) cls.add5(alloc2460, alloc2467, alloc2468) R.vm.kill_object(alloc2460) R.vm.kill_object(alloc2467) model_decoder_layers_28_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1177] model_decoder_layers_28_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1178] gv3356: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2469: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3356, R.dtype("float16")) cls.layer_norm2(alloc2468, model_decoder_layers_28_encoder_attn_layer_norm_weight4, model_decoder_layers_28_encoder_attn_layer_norm_bias4, alloc2469) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias4) model_decoder_layers_28_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] model_decoder_layers_28_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1174] gv3357: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2470: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3357, R.dtype("float16")) _2469: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_q_proj_weight4, alloc2469, model_decoder_layers_28_encoder_attn_q_proj_bias4, alloc2470) R.vm.kill_object(alloc2469) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias4) gv3358: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1319: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2470, gv3358, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2470) gv3359: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1320: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1319, gv3359, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1319) gv3360: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2471: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3360, R.dtype("float16")) _2470: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape1320, alloc2471) 
R.vm.kill_object(reshape1320) gv3361: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1321: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2471, gv3361, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2471) gv3362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1322: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1321, gv3362, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1321) model_decoder_layers_28_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] model_decoder_layers_28_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1176] gv3363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2472: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3363, R.dtype("float16")) _2471: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_out_proj_weight4, reshape1322, model_decoder_layers_28_encoder_attn_out_proj_bias4, alloc2472) R.vm.kill_object(reshape1322) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias4) gv3364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2473: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3364, R.dtype("float16")) cls.add5(alloc2468, alloc2472, alloc2473) R.vm.kill_object(alloc2468) R.vm.kill_object(alloc2472) model_decoder_layers_28_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1183] model_decoder_layers_28_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1184] gv3365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2474: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3365, R.dtype("float16")) cls.layer_norm2(alloc2473, model_decoder_layers_28_final_layer_norm_weight4, model_decoder_layers_28_final_layer_norm_bias4, alloc2474) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias4) model_decoder_layers_28_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] model_decoder_layers_28_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1180] gv3366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2475: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3366, R.dtype("float16")) _2474: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_28_fc1_weight4, alloc2474, model_decoder_layers_28_fc1_bias4, alloc2475) R.vm.kill_object(alloc2474) 
R.vm.kill_object(model_decoder_layers_28_fc1_weight4) R.vm.kill_object(model_decoder_layers_28_fc1_bias4) model_decoder_layers_28_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] model_decoder_layers_28_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1182] gv3367: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2476: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3367, R.dtype("float16")) _2475: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_28_fc2_weight4, alloc2475, model_decoder_layers_28_fc2_bias4, alloc2476) R.vm.kill_object(alloc2475) R.vm.kill_object(model_decoder_layers_28_fc2_weight4) R.vm.kill_object(model_decoder_layers_28_fc2_bias4) gv3368: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2477: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3368, R.dtype("float16")) cls.add5(alloc2473, alloc2476, alloc2477) R.vm.kill_object(alloc2473) R.vm.kill_object(alloc2476) model_decoder_layers_29_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1192] model_decoder_layers_29_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1193] gv3369: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2478: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3369, R.dtype("float16")) 
cls.layer_norm2(alloc2477, model_decoder_layers_29_self_attn_layer_norm_weight4, model_decoder_layers_29_self_attn_layer_norm_bias4, alloc2478) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias4) model_decoder_layers_29_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] model_decoder_layers_29_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1189] gv3370: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2479: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3370, R.dtype("float16")) _2478: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_q_proj_weight4, alloc2478, model_decoder_layers_29_self_attn_q_proj_bias4, alloc2479) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias4) gv3371: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1323: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2479, gv3371, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2479) model_decoder_layers_29_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] gv3372: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) alloc2480: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3372, R.dtype("float16")) _2479: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_29_self_attn_k_proj_weight4, alloc2478, alloc2480) R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight4) gv3373: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1324: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2480, gv3373, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2480) model_decoder_layers_29_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] model_decoder_layers_29_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1187] gv3374: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2481: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3374, R.dtype("float16")) _2480: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_v_proj_weight4, alloc2478, model_decoder_layers_29_self_attn_v_proj_bias4, alloc2481) R.vm.kill_object(alloc2478) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias4) gv3375: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1325: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2481, gv3375, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2481) gv3376: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2482: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3376, R.dtype("float16")) cls.concatenate1(reshape1323, reshape1324, reshape1325, alloc2482) R.vm.kill_object(reshape1323) R.vm.kill_object(reshape1324) R.vm.kill_object(reshape1325) gv3377: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1326: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2482, gv3377, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2482) gv3378: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2483: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3378, R.dtype("float16")) _2482: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1326, alloc2483) R.vm.kill_object(reshape1326) gv3379: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1327: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2483, gv3379, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2483) gv3380: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1328: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1327, gv3380, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1327) model_decoder_layers_29_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] model_decoder_layers_29_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1191] gv3381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2484: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3381, R.dtype("float16")) _2483: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_out_proj_weight4, reshape1328, model_decoder_layers_29_self_attn_out_proj_bias4, alloc2484) R.vm.kill_object(reshape1328) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias4) gv3382: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2485: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3382, 
R.dtype("float16")) cls.add5(alloc2477, alloc2484, alloc2485) R.vm.kill_object(alloc2477) R.vm.kill_object(alloc2484) model_decoder_layers_29_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1201] model_decoder_layers_29_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1202] gv3383: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2486: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3383, R.dtype("float16")) cls.layer_norm2(alloc2485, model_decoder_layers_29_encoder_attn_layer_norm_weight4, model_decoder_layers_29_encoder_attn_layer_norm_bias4, alloc2486) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias4) model_decoder_layers_29_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] model_decoder_layers_29_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1198] gv3384: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2487: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3384, R.dtype("float16")) _2486: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_q_proj_weight4, alloc2486, model_decoder_layers_29_encoder_attn_q_proj_bias4, alloc2487) R.vm.kill_object(alloc2486) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias4) gv3385: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1329: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2487, gv3385, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2487) gv3386: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1330: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1329, gv3386, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1329) gv3387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2488: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3387, R.dtype("float16")) _2487: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1330, alloc2488) R.vm.kill_object(reshape1330) gv3388: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1331: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2488, gv3388, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2488) gv3389: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1332: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1331, gv3389, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1331) model_decoder_layers_29_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] model_decoder_layers_29_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1200] gv3390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2489: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3390, R.dtype("float16")) _2488: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_out_proj_weight4, reshape1332, model_decoder_layers_29_encoder_attn_out_proj_bias4, alloc2489) R.vm.kill_object(reshape1332) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias4) gv3391: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2490: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3391, R.dtype("float16")) cls.add5(alloc2485, alloc2489, alloc2490) R.vm.kill_object(alloc2485) R.vm.kill_object(alloc2489) model_decoder_layers_29_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1207] model_decoder_layers_29_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1208] gv3392: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2491: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3392, R.dtype("float16")) cls.layer_norm2(alloc2490, model_decoder_layers_29_final_layer_norm_weight4, model_decoder_layers_29_final_layer_norm_bias4, alloc2491) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias4) model_decoder_layers_29_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] model_decoder_layers_29_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1204] gv3393: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2492: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3393, R.dtype("float16")) _2491: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_29_fc1_weight4, alloc2491, model_decoder_layers_29_fc1_bias4, alloc2492) R.vm.kill_object(alloc2491) R.vm.kill_object(model_decoder_layers_29_fc1_weight4) R.vm.kill_object(model_decoder_layers_29_fc1_bias4) model_decoder_layers_29_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] model_decoder_layers_29_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1206] gv3394: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2493: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), 
gv3394, R.dtype("float16")) _2492: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_29_fc2_weight4, alloc2492, model_decoder_layers_29_fc2_bias4, alloc2493) R.vm.kill_object(alloc2492) R.vm.kill_object(model_decoder_layers_29_fc2_weight4) R.vm.kill_object(model_decoder_layers_29_fc2_bias4) gv3395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2494: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3395, R.dtype("float16")) cls.add5(alloc2490, alloc2493, alloc2494) R.vm.kill_object(alloc2490) R.vm.kill_object(alloc2493) model_decoder_layers_30_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1216] model_decoder_layers_30_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1217] gv3396: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2495: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3396, R.dtype("float16")) cls.layer_norm2(alloc2494, model_decoder_layers_30_self_attn_layer_norm_weight4, model_decoder_layers_30_self_attn_layer_norm_bias4, alloc2495) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias4) model_decoder_layers_30_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] model_decoder_layers_30_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1213] gv3397: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2496: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3397, R.dtype("float16")) _2495: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_q_proj_weight4, alloc2495, model_decoder_layers_30_self_attn_q_proj_bias4, alloc2496) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias4) gv3398: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1333: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2496, gv3398, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2496) model_decoder_layers_30_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] gv3399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2497: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3399, R.dtype("float16")) _2496: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_30_self_attn_k_proj_weight4, alloc2495, alloc2497) R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight4) gv3400: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1334: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2497, gv3400, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2497) model_decoder_layers_30_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] model_decoder_layers_30_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1211] gv3401: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2498: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3401, R.dtype("float16")) _2497: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_v_proj_weight4, alloc2495, model_decoder_layers_30_self_attn_v_proj_bias4, alloc2498) R.vm.kill_object(alloc2495) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias4) gv3402: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1335: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2498, gv3402, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2498) gv3403: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2499: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3403, R.dtype("float16")) 
cls.concatenate1(reshape1333, reshape1334, reshape1335, alloc2499) R.vm.kill_object(reshape1333) R.vm.kill_object(reshape1334) R.vm.kill_object(reshape1335) gv3404: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1336: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2499, gv3404, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2499) gv3405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2500: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3405, R.dtype("float16")) _2499: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1336, alloc2500) R.vm.kill_object(reshape1336) gv3406: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1337: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2500, gv3406, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2500) gv3407: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1338: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1337, gv3407, sinfo_args=(R.Tensor((1, seq_len, 
1280), dtype="float16"),)) R.vm.kill_object(reshape1337) model_decoder_layers_30_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] model_decoder_layers_30_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1215] gv3408: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2501: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3408, R.dtype("float16")) _2500: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_out_proj_weight4, reshape1338, model_decoder_layers_30_self_attn_out_proj_bias4, alloc2501) R.vm.kill_object(reshape1338) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias4) gv3409: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2502: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3409, R.dtype("float16")) cls.add5(alloc2494, alloc2501, alloc2502) R.vm.kill_object(alloc2494) R.vm.kill_object(alloc2501) model_decoder_layers_30_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1225] model_decoder_layers_30_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1226] gv3410: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2503: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, 
R.prim_value(0), gv3410, R.dtype("float16")) cls.layer_norm2(alloc2502, model_decoder_layers_30_encoder_attn_layer_norm_weight4, model_decoder_layers_30_encoder_attn_layer_norm_bias4, alloc2503) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias4) model_decoder_layers_30_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] model_decoder_layers_30_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1222] gv3411: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2504: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3411, R.dtype("float16")) _2503: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_q_proj_weight4, alloc2503, model_decoder_layers_30_encoder_attn_q_proj_bias4, alloc2504) R.vm.kill_object(alloc2503) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias4) gv3412: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1339: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2504, gv3412, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2504) gv3413: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) 
reshape1340: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1339, gv3413, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1339) gv3414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2505: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3414, R.dtype("float16")) _2504: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1340, alloc2505) R.vm.kill_object(reshape1340) gv3415: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1341: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2505, gv3415, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2505) gv3416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1342: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1341, gv3416, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1341) model_decoder_layers_30_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] model_decoder_layers_30_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1224] gv3417: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2506: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3417, R.dtype("float16")) _2505: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_out_proj_weight4, reshape1342, model_decoder_layers_30_encoder_attn_out_proj_bias4, alloc2506) R.vm.kill_object(reshape1342) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias4) gv3418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2507: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3418, R.dtype("float16")) cls.add5(alloc2502, alloc2506, alloc2507) R.vm.kill_object(alloc2502) R.vm.kill_object(alloc2506) model_decoder_layers_30_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1231] model_decoder_layers_30_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1232] gv3419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2508: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3419, R.dtype("float16")) cls.layer_norm2(alloc2507, model_decoder_layers_30_final_layer_norm_weight4, model_decoder_layers_30_final_layer_norm_bias4, alloc2508) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias4) model_decoder_layers_30_fc1_weight4: R.Tensor((5120, 1280), 
dtype="float16") = packed_params[1227] model_decoder_layers_30_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1228] gv3420: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2509: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3420, R.dtype("float16")) _2508: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_30_fc1_weight4, alloc2508, model_decoder_layers_30_fc1_bias4, alloc2509) R.vm.kill_object(alloc2508) R.vm.kill_object(model_decoder_layers_30_fc1_weight4) R.vm.kill_object(model_decoder_layers_30_fc1_bias4) model_decoder_layers_30_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] model_decoder_layers_30_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1230] gv3421: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2510: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3421, R.dtype("float16")) _2509: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_30_fc2_weight4, alloc2509, model_decoder_layers_30_fc2_bias4, alloc2510) R.vm.kill_object(alloc2509) R.vm.kill_object(model_decoder_layers_30_fc2_weight4) R.vm.kill_object(model_decoder_layers_30_fc2_bias4) gv3422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2511: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), 
gv3422, R.dtype("float16")) cls.add5(alloc2507, alloc2510, alloc2511) R.vm.kill_object(alloc2507) R.vm.kill_object(alloc2510) model_decoder_layers_31_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1240] model_decoder_layers_31_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1241] gv3423: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2512: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3423, R.dtype("float16")) cls.layer_norm2(alloc2511, model_decoder_layers_31_self_attn_layer_norm_weight4, model_decoder_layers_31_self_attn_layer_norm_bias4, alloc2512) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias4) model_decoder_layers_31_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] model_decoder_layers_31_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1237] gv3424: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2513: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3424, R.dtype("float16")) _2512: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_q_proj_weight4, alloc2512, model_decoder_layers_31_self_attn_q_proj_bias4, alloc2513) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias4) gv3425: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1343: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2513, gv3425, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2513) model_decoder_layers_31_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] gv3426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2514: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3426, R.dtype("float16")) _2513: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_31_self_attn_k_proj_weight4, alloc2512, alloc2514) R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight4) gv3427: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1344: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2514, gv3427, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2514) model_decoder_layers_31_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] model_decoder_layers_31_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1235] gv3428: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2515: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3428, R.dtype("float16")) _2514: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_v_proj_weight4, alloc2512, model_decoder_layers_31_self_attn_v_proj_bias4, alloc2515) R.vm.kill_object(alloc2512) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight4) R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias4) gv3429: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1345: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2515, gv3429, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2515) gv3430: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) alloc2516: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3430, R.dtype("float16")) cls.concatenate1(reshape1343, reshape1344, reshape1345, alloc2516) R.vm.kill_object(reshape1343) R.vm.kill_object(reshape1344) R.vm.kill_object(reshape1345) gv3431: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1346: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2516, gv3431, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) R.vm.kill_object(alloc2516) gv3432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2517: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3432, R.dtype("float16")) _2516: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1346, alloc2517) R.vm.kill_object(reshape1346) gv3433: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1347: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2517, gv3433, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2517) gv3434: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1348: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1347, gv3434, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1347) model_decoder_layers_31_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1238] model_decoder_layers_31_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1239] gv3435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2518: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3435, R.dtype("float16")) _2517: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_out_proj_weight4, reshape1348, model_decoder_layers_31_self_attn_out_proj_bias4, alloc2518) R.vm.kill_object(reshape1348) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias4) gv3436: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2519: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3436, R.dtype("float16")) cls.add5(alloc2511, alloc2518, alloc2519) R.vm.kill_object(alloc2511) R.vm.kill_object(alloc2518) model_decoder_layers_31_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1249] model_decoder_layers_31_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1250] gv3437: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2520: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3437, R.dtype("float16")) cls.layer_norm2(alloc2519, model_decoder_layers_31_encoder_attn_layer_norm_weight4, model_decoder_layers_31_encoder_attn_layer_norm_bias4, alloc2520) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias4) model_decoder_layers_31_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] model_decoder_layers_31_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1246] gv3438: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2521: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3438, R.dtype("float16")) _2520: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_q_proj_weight4, alloc2520, model_decoder_layers_31_encoder_attn_q_proj_bias4, alloc2521) R.vm.kill_object(alloc2520) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight4) R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias4) gv3439: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1349: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2521, gv3439, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2521) gv3440: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) reshape1350: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1349, gv3440, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(reshape1349) gv3441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) alloc2522: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3441, R.dtype("float16")) _2521: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", 
paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1350, alloc2522) R.vm.kill_object(reshape1350) gv3442: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) reshape1351: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2522, gv3442, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) R.vm.kill_object(alloc2522) gv3443: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) reshape1352: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1351, gv3443, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) R.vm.kill_object(reshape1351) model_decoder_layers_31_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] model_decoder_layers_31_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1248] gv3444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2523: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3444, R.dtype("float16")) _2522: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_out_proj_weight4, reshape1352, model_decoder_layers_31_encoder_attn_out_proj_bias4, alloc2523) R.vm.kill_object(reshape1352) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight4) R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias4) 
gv3445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2524: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3445, R.dtype("float16")) R.vm.kill_object(storage39) cls.add5(alloc2519, alloc2523, alloc2524) R.vm.kill_object(alloc2519) R.vm.kill_object(alloc2523) model_decoder_layers_31_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1255] model_decoder_layers_31_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1256] gv3446: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2525: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3446, R.dtype("float16")) cls.layer_norm2(alloc2524, model_decoder_layers_31_final_layer_norm_weight4, model_decoder_layers_31_final_layer_norm_bias4, alloc2525) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight4) R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias4) model_decoder_layers_31_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] model_decoder_layers_31_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1252] gv3447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) alloc2526: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3447, R.dtype("float16")) R.vm.kill_object(storage37) _2525: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", 
model_decoder_layers_31_fc1_weight4, alloc2525, model_decoder_layers_31_fc1_bias4, alloc2526) R.vm.kill_object(alloc2525) R.vm.kill_object(model_decoder_layers_31_fc1_weight4) R.vm.kill_object(model_decoder_layers_31_fc1_bias4) model_decoder_layers_31_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] model_decoder_layers_31_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1254] gv3448: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2527: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3448, R.dtype("float16")) R.vm.kill_object(storage38) _2526: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_31_fc2_weight4, alloc2526, model_decoder_layers_31_fc2_bias4, alloc2527) R.vm.kill_object(alloc2526) R.vm.kill_object(model_decoder_layers_31_fc2_weight4) R.vm.kill_object(model_decoder_layers_31_fc2_bias4) gv3449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) alloc2528: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3449, R.dtype("float16")) R.vm.kill_object(storage40) cls.add5(alloc2524, alloc2527, alloc2528) R.vm.kill_object(alloc2524) R.vm.kill_object(alloc2527) model_decoder_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1257] model_decoder_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1258] gv3450: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),))
    # --- tail of the (partially visible) decoder forward function above ---
    # Final decoder layer norm, then project the last hidden state to vocab
    # logits: (1, 1, 1280) fp16 -> (1, 1, 51866) fp32 via a fused cuBLAS kernel.
    alloc2529: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3450, R.dtype("float16"))
    R.vm.kill_object(storage41)
    cls.layer_norm2(alloc2528, model_decoder_layer_norm_weight4, model_decoder_layer_norm_bias4, alloc2529)
    R.vm.kill_object(alloc2528)
    R.vm.kill_object(model_decoder_layer_norm_weight4)
    R.vm.kill_object(model_decoder_layer_norm_bias4)
    storage42: R.Object = R.vm.alloc_storage(R.shape([2560]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    alloc2530: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage42, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16"))
    R.vm.kill_object(storage42)
    # cls.index selects the last position's hidden state into the (1, 1, 1280) buffer.
    cls.index(alloc2529, alloc2530)
    R.vm.kill_object(alloc2529)
    storage: R.Object = R.vm.alloc_storage(R.shape([207464]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    alloc2531: R.Tensor((1, 1, 51866), dtype="float32") = R.vm.alloc_tensor(storage, R.prim_value(0), R.shape([1, 1, 51866]), R.dtype("float32"))
    R.vm.kill_object(storage)
    # Tied LM head: logits = hidden @ embed_tokens_weight^T (permute_dims + matmul fused, cuBLAS offload).
    _2530: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul2_cublas", model_decoder_embed_tokens_weight4, alloc2530, alloc2531)
    R.vm.kill_object(model_decoder_embed_tokens_weight4)
    R.vm.kill_object(alloc2530)
    return alloc2531

# Lowered Relax VM function: renormalize each row of `probs` under its per-row
# nucleus (top-p) threshold. The actual math lives in the scheduled TIR kernels
# cls.top_p_pivot_cutoff / cls.top_p_renorm_after_cutoff (not visible in this
# chunk) -- presumably pivot search below top_p mass, then rescale; confirm
# against the kernel definitions. Everything here is generated plumbing:
# rank/dtype checks, symbolic-shape binding via a shape heap, explicit storage
# alloc and kill_object lifetime markers.
@R.function
def renormalize_by_top_p(probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), top_p: R.Tensor(("batch_size",), dtype="float32"), init_pivots: R.Tensor(("batch_size", 3), dtype="float32")) -> R.Tensor(("batch_size", "vocab_size"), dtype="float32"):
    batch_size = T.int64()
    vocab_size = T.int64()
    # tir_var_upper_bound caps batch_size at 8 -- the fixed 32-byte scratch
    # buffers below (8 x float32) are sized to that bound.
    R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}})
    cls = Module
    # Scratch int64 heap: match_shape writes symbolic dims (slot 1 = batch_size,
    # slot 0/3 per the encoded index streams), make_shape reads them back.
    shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),))
    # Rank/dtype validation of the three parameters.
    R.call_packed("vm.builtin.check_tensor_info", probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", top_p, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[1], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", init_pivots, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[2], param=init_pivots, annotation=R.Tensor((batch_size, 3), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    # Bind/verify symbolic shapes against the heap.
    R.call_packed("vm.builtin.match_shape", probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", top_p, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[1], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", init_pivots, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(0), R.prim_value(3), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[2], param=init_pivots, annotation=R.Tensor((batch_size, 3), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    cls.shape_func4(shape_heap)
    # Two per-row float32 scalars produced by the pivot-cutoff kernel
    # (cutoff pivot and renormalization sum, going by how they are consumed below).
    storage43: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv3451: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),))
    alloc2532: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage43, R.prim_value(0), gv3451, R.dtype("float32"))
    R.vm.kill_object(storage43)
    storage44: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv3452: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),))
    alloc2533: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage44, R.prim_value(0), gv3452, R.dtype("float32"))
    R.vm.kill_object(storage44)
    cls.top_p_pivot_cutoff(probs, top_p, init_pivots, alloc2532, alloc2533)
    # NOTE(review): `lv6` is never used after this point in the function --
    # dead binding left over from lowering; harmless.
    lv6: R.Tuple(R.Tensor(dtype="float32", ndim=1), R.Tensor(dtype="float32", ndim=1)) = alloc2532, alloc2533
    gv3453: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),))
    storage45: R.Object = R.vm.alloc_storage(gv3453, R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv3454: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    # Output buffer (batch_size, vocab_size); written by the renorm kernel.
    alloc2534: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage45, R.prim_value(0), gv3454, R.dtype("float32"))
    R.vm.kill_object(storage45)
    cls.top_p_renorm_after_cutoff(probs, alloc2532, alloc2533, alloc2534)
    R.vm.kill_object(alloc2532)
    R.vm.kill_object(alloc2533)
    # Verify the return value matches the declared (batch_size, vocab_size) shape.
    R.call_packed("vm.builtin.match_shape", alloc2534, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=renormalize_by_top_p, loc=return, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    return alloc2534

@R.function
def sample_with_top_p(sorted_probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), sorted_indices: R.Tensor(("batch_size", "vocab_size"), dtype="int32"), uniform_samples: R.Tensor(("num_samples",), dtype="float32"), sample_indices: R.Tensor(("num_samples",), dtype="int32"), top_p: R.Tensor(("batch_size",
dtype="float32")) -> R.Tensor(("num_samples",), dtype="int32"):
    # Lowered Relax VM function: draw one vocabulary id per sample from rows of
    # a probability matrix that the caller has already sorted (descending, going
    # by the cumsum + sorted-index lookup below -- confirm against the
    # cls.cumsum / cls.get_index_from_sorted kernel definitions, which are not
    # visible in this chunk). `uniform_samples` are the random draws,
    # `sample_indices` maps each sample to its batch row, `top_p` truncates
    # each row's distribution. Returns (num_samples,) int32 token ids.
    num_samples = T.int64()
    batch_size = T.int64()
    vocab_size = T.int64()
    R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}})
    cls = Module
    # Scratch int64 heap for symbolic dims; shape_func3 derives the extra slots
    # (e.g. workspace sizes) used by make_shape below.
    shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(6),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),))
    # Rank/dtype validation of the five parameters.
    R.call_packed("vm.builtin.check_tensor_info", sorted_probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[0], param=sorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", sorted_indices, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", uniform_samples, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[2], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", sample_indices, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[3], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.check_tensor_info", top_p, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[4], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    # Bind/verify symbolic shapes (batch_size, vocab_size, num_samples) in the heap.
    R.call_packed("vm.builtin.match_shape", sorted_probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=sample_with_top_p, loc=param[0], param=sorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", sorted_indices, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=sample_with_top_p, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", uniform_samples, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=sample_with_top_p, loc=param[2], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", sample_indices, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=sample_with_top_p, loc=param[3], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))
    R.call_packed("vm.builtin.match_shape", top_p, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sample_with_top_p, loc=param[4], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
    cls.shape_func3(shape_heap)
    # Reshape the two per-sample vectors to column form (num_samples, 1).
    gv2568: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    uniform_samples1: R.Tensor((num_samples, 1), dtype="float32") = R.call_packed("vm.builtin.reshape", uniform_samples, gv2568, sinfo_args=(R.Tensor((num_samples, 1), dtype="float32"),))
    gv2569: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    sample_indices1: R.Tensor((num_samples, 1), dtype="int32") = R.call_packed("vm.builtin.reshape", sample_indices, gv2569, sinfo_args=(R.Tensor((num_samples, 1), dtype="int32"),))
    gv2570: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    # NOTE(review): misleading generated name -- `sample_indices2` is the
    # reshaped float32 `top_p` values (batch_size, 1), not an index tensor.
    sample_indices2: R.Tensor((batch_size, 1), dtype="float32") = R.call_packed("vm.builtin.reshape", top_p, gv2570, sinfo_args=(R.Tensor((batch_size, 1), dtype="float32"),))
    # storage33 (32 bytes) backs both alloc1978 and, after alloc1978 is killed,
    # alloc1981 -- deliberate buffer reuse emitted by memory planning.
    storage33: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv2571: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    alloc1978: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage33, R.prim_value(0), gv2571, R.dtype("int32"))
    gv2572: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=1),))
    # Fill alloc1978 via the cls.full TIR kernel (dynamic-shape call).
    R.call_packed("vm.builtin.call_tir_dyn", cls.full, alloc1978, gv2572, sinfo_args=(R.Tuple,))
    gv2573: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),))
    storage34: R.Object = R.vm.alloc_storage(gv2573, R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv2574: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),))
    # uint8 workspace for the cumsum kernel; size comes from shape_func3.
    lv1: R.Tensor(dtype="uint8", ndim=1) = R.vm.alloc_tensor(storage34, R.prim_value(0), gv2574, R.dtype("uint8"))
    R.vm.kill_object(storage34)
    gv2575: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(5), sinfo_args=(R.Shape(ndim=1),))
    storage35: R.Object = R.vm.alloc_storage(gv2575, R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv2576: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    alloc1979: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage35, R.prim_value(0), gv2576, R.dtype("float32"))
    R.vm.kill_object(storage35)
    # Row-wise cumulative sums of sorted_probs -> alloc1979.
    cls.cumsum(sorted_probs, lv1, alloc1979)
    R.vm.kill_object(lv1)
    storage36: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))
    gv2577: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    alloc1980: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage36, R.prim_value(0), gv2577, R.dtype("float32"))
    R.vm.kill_object(storage36)
    # Per-row renormalization factor from the cumsum, the top_p values
    # (sample_indices2) and the cls.full output.
    cls.get_renorm_prob(alloc1979, sample_indices2, alloc1978, alloc1980)
    R.vm.kill_object(sample_indices2)
    R.vm.kill_object(alloc1978)
    gv2578: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))
    alloc1981: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage33, R.prim_value(0), gv2578, R.dtype("int32"))
    R.vm.kill_object(storage33)
    # Binary-search-style lookup of each uniform draw in the (renormalized)
    # cumsum, mapped back through sorted_indices to original token ids.
    cls.get_index_from_sorted(alloc1979, sorted_indices, alloc1980, uniform_samples1, sample_indices1, alloc1981)
    R.vm.kill_object(uniform_samples1)
    R.vm.kill_object(sample_indices1)
    R.vm.kill_object(alloc1979)
    R.vm.kill_object(alloc1980)
    gv2579: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),))
    # Flatten (num_samples, 1) -> (num_samples,) for the caller.
    gv2: R.Tensor((num_samples,), dtype="int32") = R.call_packed("vm.builtin.reshape", alloc1981, gv2579, sinfo_args=(R.Tensor((num_samples,), dtype="int32"),))
    R.vm.kill_object(alloc1981)
    return gv2

# --- head of sampler_take_probs (definition continues past this chunk) ---
@R.function
def sampler_take_probs(unsorted_probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), sorted_indices: R.Tensor(("batch_size", "vocab_size"), dtype="int32"), sample_indices:
R.Tensor(("num_samples",), dtype="int32"), sampling_result: R.Tensor(("num_samples",), dtype="int32"), lobprob_offsets: R.Tensor(("num_positions",), dtype="int32")) -> R.Tuple(R.Tensor(("num_samples",), dtype="float32"), R.Tensor(("num_positions",), dtype="float32"), R.Tensor(("num_positions",), dtype="int32")): num_samples = T.int64() num_positions = T.int64() batch_size = T.int64() vocab_size = T.int64() R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(4),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", unsorted_probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[0], param=unsorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", sorted_indices, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", sample_indices, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", sampling_result, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[3], param=sampling_result, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", lobprob_offsets, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[4], param=lobprob_offsets, 
annotation=R.Tensor((num_positions,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", unsorted_probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=sampler_take_probs, loc=param[0], param=unsorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", sorted_indices, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=sampler_take_probs, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", sample_indices, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=sampler_take_probs, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", sampling_result, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=sampler_take_probs, loc=param[3], param=sampling_result, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", lobprob_offsets, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), R.str("ErrorContext(fn=sampler_take_probs, loc=param[4], param=lobprob_offsets, annotation=R.Tensor((num_positions,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) storage: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) alloc: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage, R.prim_value(0), gv, R.dtype("float32")) R.vm.kill_object(storage) storage1: R.Object = 
R.vm.alloc_storage(R.shape([192]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv1: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),)) alloc1: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage1, R.prim_value(0), gv1, R.dtype("float32")) R.vm.kill_object(storage1) storage2: R.Object = R.vm.alloc_storage(R.shape([192]), R.prim_value(0), R.dtype("uint8"), R.str("global")) gv2: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),)) alloc2: R.Tensor(dtype="int32", ndim=1) = R.vm.alloc_tensor(storage2, R.prim_value(0), gv2, R.dtype("int32")) R.vm.kill_object(storage2) cls.sampler_take_probs_tir(unsorted_probs, sorted_indices, sample_indices, sampling_result, lobprob_offsets, alloc, alloc1, alloc2) gv3: R.Tuple(R.Tensor(dtype="float32", ndim=1), R.Tensor(dtype="float32", ndim=1), R.Tensor(dtype="int32", ndim=1)) = alloc, alloc1, alloc2 R.vm.kill_object(alloc) R.vm.kill_object(alloc1) R.vm.kill_object(alloc2) gv3_1: R.Tensor(dtype="float32", ndim=1) = gv3[0] R.call_packed("vm.builtin.match_shape", gv3_1, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=sampler_take_probs, loc=return, annotation=R.Tuple(R.Tensor((num_samples,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) gv4: R.Tensor(dtype="float32", ndim=1) = gv3[1] R.call_packed("vm.builtin.match_shape", gv4, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(3), R.str("ErrorContext(fn=sampler_take_probs, loc=return, annotation=R.Tuple(R.Tensor((num_samples,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) gv5: R.Tensor(dtype="int32", ndim=1) = gv3[2] 
R.call_packed("vm.builtin.match_shape", gv5, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(3), R.str("ErrorContext(fn=sampler_take_probs, loc=return, annotation=R.Tuple(R.Tensor((num_samples,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) return gv3 @R.function def sampler_verify_draft_tokens(draft_probs: R.Tensor(("num_nodes", "vocab_size"), dtype="float32"), draft_tokens: R.Tensor(("num_nodes",), dtype="int32"), model_probs: R.Tensor(("num_nodes", "vocab_size"), dtype="float32"), token_tree_first_child: R.Tensor(("num_nodes",), dtype="int32"), token_tree_next_sibling: R.Tensor(("num_nodes",), dtype="int32"), uniform_samples: R.Tensor(("num_nodes",), dtype="float32"), token_tree_parent_ptr: R.Tensor(("nbatch",), dtype="int32")) -> R.Tuple(R.Tensor(("num_nodes", "vocab_size"), dtype="float32"), R.Tensor(("nbatch",), dtype="int32")): num_nodes = T.int64() vocab_size = T.int64() nbatch = T.int64() R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", draft_probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[0], param=draft_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", draft_tokens, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[1], param=draft_tokens, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", model_probs, R.prim_value(2), R.dtype("float32"), 
R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[2], param=model_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", token_tree_first_child, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[3], param=token_tree_first_child, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", token_tree_next_sibling, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[4], param=token_tree_next_sibling, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", uniform_samples, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[5], param=uniform_samples, annotation=R.Tensor((num_nodes,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", token_tree_parent_ptr, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[6], param=token_tree_parent_ptr, annotation=R.Tensor((nbatch,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", draft_probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[0], param=draft_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", draft_tokens, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[1], param=draft_tokens, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", model_probs, shape_heap, R.prim_value(2), R.prim_value(3), 
R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[2], param=model_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", token_tree_first_child, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[3], param=token_tree_first_child, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", token_tree_next_sibling, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[4], param=token_tree_next_sibling, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", uniform_samples, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[5], param=uniform_samples, annotation=R.Tensor((num_nodes,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", token_tree_parent_ptr, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[6], param=token_tree_parent_ptr, annotation=R.Tensor((nbatch,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) cls.batch_verify_on_gpu_single_kernel(draft_probs, draft_tokens, model_probs, token_tree_first_child, token_tree_next_sibling, uniform_samples, token_tree_parent_ptr) gv4: R.Tuple(R.Tensor((num_nodes, vocab_size), dtype="float32"), R.Tensor((nbatch,), dtype="int32")) = model_probs, token_tree_parent_ptr return gv4 @R.function def softmax_with_temperature(logits: R.Tensor(("batch_size", 1, "vocab_size"), dtype="float32"), temperature: R.Tensor(("batch_size",), dtype="float32")) -> R.Tensor(("batch_size", 1, "vocab_size"), dtype="float32"): batch_size = T.int64() vocab_size 
= T.int64() R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) cls = Module shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(5),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) R.call_packed("vm.builtin.check_tensor_info", logits, R.prim_value(3), R.dtype("float32"), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[0], param=logits, annotation=R.Tensor((batch_size, 1, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.check_tensor_info", temperature, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[1], param=temperature, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", logits, shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[0], param=logits, annotation=R.Tensor((batch_size, 1, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) R.call_packed("vm.builtin.match_shape", temperature, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[1], param=temperature, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) cls.shape_func5(shape_heap) gv3455: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) lv: R.Tensor((batch_size, vocab_size), dtype="float32") = R.call_packed("vm.builtin.reshape", logits, gv3455, sinfo_args=(R.Tensor((batch_size, vocab_size), dtype="float32"),)) gv3456: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) storage46: R.Object = R.vm.alloc_storage(gv3456, R.prim_value(0), R.dtype("uint8"), R.str("global")) gv3457: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=2),)) alloc2535: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage46, R.prim_value(0), gv3457, R.dtype("float32")) R.vm.kill_object(storage46) gv3458: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) storage47: R.Object = R.vm.alloc_storage(gv3458, R.prim_value(0), R.dtype("uint8"), R.str("global")) gv3459: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=2),)) alloc2536: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage47, R.prim_value(0), gv3459, R.dtype("float32")) R.vm.kill_object(storage47) cls.chunk_lse(lv, temperature, alloc2535, alloc2536) lv1: R.Tuple(R.Tensor(dtype="float32", ndim=2), R.Tensor(dtype="float32", ndim=2)) = alloc2535, alloc2536 gv3460: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),)) storage48: R.Object = R.vm.alloc_storage(gv3460, R.prim_value(0), R.dtype("uint8"), R.str("global")) gv3461: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) alloc2537: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage48, R.prim_value(0), gv3461, R.dtype("float32")) R.vm.kill_object(storage48) cls.softmax_with_chunked_sum(lv, temperature, alloc2535, alloc2536, alloc2537) R.vm.kill_object(lv) 
R.vm.kill_object(alloc2535) R.vm.kill_object(alloc2536) gv3462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=3),)) gv: R.Tensor((batch_size, 1, vocab_size), dtype="float32") = R.call_packed("vm.builtin.reshape", alloc2537, gv3462, sinfo_args=(R.Tensor((batch_size, 1, vocab_size), dtype="float32"),)) R.vm.kill_object(alloc2537) return gv # Metadata omitted. Use show_meta=True in script() method to show it.