#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"

#include "common.h"
#include "mimi-model.h"

#include <algorithm>
#include <array>
#include <cinttypes>
#include <cmath>
#include <cstdarg>
#include <cstdio>
#include <fstream>
#include <functional>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

/**
 * Implementation of Kyutai's Mimi model using GGML.
 * Based on this research: https://github.com/ngxson/ggml-easy/blob/master/demo/kyutai-mimi.cpp
 *
 * NOTE: only the decoder is working for now.
 *
 * Background:
 * - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc.
 * - Audio codes must be in the order: N semantic codes followed by (N*31) acoustic codes
 *   (in other words, the input matrix has shape 32 cols x N rows)
 *
 * How it works:
 * 1. The audio codes are passed to the RVQ (mimi_residual_vector_quantizer) to get the latent code
 * 2. The latent code is passed to a mimi_conv_transpose_1d (depthwise) to upscale it
 * 3. The upscaled code is passed to the transformer, which converts N frames to N frames
 * 4. The output embeddings are then passed to SEANet (mimi_encoder_decoder) to get the final waveform
 * 5. The waveform is written to a file
 */
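// Minimal usage sketch (illustrative only, not part of the implementation).
// `load_codes_somehow()` and `write_wav_file()` are hypothetical helpers; the
// point is simply the order of the calls on mimi_model:
//
//   mimi_model model("mimi.gguf", /*verbose=*/true);
//   std::vector<int32_t> codes   = load_codes_somehow();  // 32 codes per frame, see layout above
//   std::vector<float>   samples = model.decode(codes);   // float PCM samples
//   write_wav_file("out.wav", samples, model.get_sample_rate());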
// copied from https://huggingface.co/kyutai/mimi/blob/main/config.json
struct mimi_config_t {
    bool causal = true;
    int sample_rate = 24000;
    int max_position_embeddings = 8000;

    int num_hidden_layers = 8;
    int n_embd = 512;
    int n_ffn = 2048;
    int n_head = 8;
    int n_head_kv = 8;
    int n_rot = 64;
    float norm_eps = 1e-5;
    float rope_theta = 10000.0f;
    int sliding_window = 250;

    std::array<int, 4> upsampling_ratio   = {8, 6, 5, 4};
    std::array<int, 4> downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio

    // vector quantizer
    float frame_rate = 12.5;
    int audio_channels = 1;
    int codebook_size = 2048;
    int codebook_dim = 256;
    int n_semantic_components = 1;
    int n_acoustic_components = 31;

    // decode
    float trim_right_ratio = 1.0f;
    int n_codes_per_frame = (sliding_window / 2) * (n_semantic_components + n_acoustic_components);
} mimi_config;
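// Quick sanity check of the arithmetic implied by the values above (no new configuration):
//   n_codes_per_frame = (250 / 2) * (1 + 31) = 125 * 32 = 4000 codes per decode_frame() call
//   each frame of 32 codes covers sample_rate / frame_rate = 24000 / 12.5 = 1920 samples,
//   which matches the upsample stride (2) times the SEANet strides: 2 * 8 * 6 * 5 * 4 = 1920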
// Adapted from https://github.com/ngxson/ggml-easy/blob/master/ggml-easy.h
struct mimi_ggml_ctx {
    gguf_context * ctx_gguf = nullptr;
    ggml_context * ctx_data = nullptr;
    ggml_context * ctx_gf   = nullptr;

    // CPU-only for now, as many kernels are missing and we actually get less performance with GPU
    ggml_backend_t backend = nullptr;
    ggml_backend_buffer_t buf = nullptr;
    ggml_backend_sched_ptr sched;
    ggml_cgraph * gf = nullptr;
    std::vector<uint8_t> buf_compute_meta;
    int max_nodes = 16 * 1024;

    std::unordered_map<std::string, ggml_tensor *> tensors;

    mimi_ggml_ctx() {
        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        auto buft = ggml_backend_get_default_buffer_type(backend);
        sched.reset(
            ggml_backend_sched_new(&backend, &buft, 1, max_nodes, false)
        );
        buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
    }

    void load_gguf(const char * fname) {
        ggml_context * meta = nullptr;
        gguf_init_params params = {
            /*.no_alloc = */ true,
            /*.ctx      = */ &meta,
        };
        ctx_gguf = gguf_init_from_file(fname, params);

        // load tensors
        const int n_tensors = gguf_get_n_tensors(ctx_gguf);
        std::vector<uint8_t> read_buf;
        ggml_init_params ggml_params = {
            /*.mem_size   =*/ (n_tensors + 1) * ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ctx_data = ggml_init(ggml_params);
        auto fin = std::ifstream(fname, std::ios::binary);
        if (!fin) {
            ggml_free(meta);
            throw std::runtime_error("cannot open model file for loading tensors");
        }

        // add tensors to context
        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name(ctx_gguf, i);
            ggml_tensor * t = ggml_get_tensor(meta, name);
            ggml_tensor * cur = ggml_dup_tensor(ctx_data, t);
            ggml_set_name(cur, name);
            tensors.insert({name, cur});
        }

        // alloc memory and offload data
        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
        buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft);
        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name(ctx_gguf, i);
            ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
            const size_t offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i);
            // printf("%s: Loading tensor \"%s\"\n", __func__, name);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
                ggml_free(meta);
                throw std::runtime_error(string_format("failed to seek for tensor: %s", name));
            }
            int num_bytes = ggml_nbytes(cur);
            if (ggml_backend_buft_is_host(buft)) {
                // for the CPU and Metal backend, we can read directly into the tensor
                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
            } else {
                // read into a temporary buffer first, then copy to device memory
                read_buf.resize(num_bytes);
                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
            }
        }
        printf("%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname);
        fin.close();
        ggml_free(meta);
    }

    /**
     * Build a cgraph using the given builder function.
     *
     * The built cgraph will be stored in `gf`
     */
    void build_graph(std::function<void(ggml_context *, ggml_cgraph *)> builder_fn) {
        ggml_free(ctx_gf);
        struct ggml_init_params params = {
            /*.mem_size   =*/ buf_compute_meta.size(),
            /*.mem_buffer =*/ buf_compute_meta.data(),
            /*.no_alloc   =*/ true,
        };
        ctx_gf = ggml_init(params);
        ggml_backend_sched_reset(sched.get());
        gf = ggml_new_graph_custom(ctx_gf, max_nodes, false);
        builder_fn(ctx_gf, gf);
        ggml_backend_sched_alloc_graph(sched.get(), gf);
    }

    ggml_status compute() {
        ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf);
        return status;
    }

    void set_tensor_data(const std::string & name, const void * data) {
        ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str());
        if (!t) {
            throw std::runtime_error(string_format("tensor not found: %s", name.c_str()));
        }
        ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t));
    }

    std::pair<ggml_tensor *, std::vector<uint8_t>> get_tensor_data(const std::string & name) {
        ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str());
        if (!t) {
            throw std::runtime_error(string_format("tensor not found: %s", name.c_str()));
        }
        std::vector<uint8_t> data(ggml_nbytes(t));
        ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
        return std::make_pair(t, data);
    }

    ggml_tensor * get_weight(const char * fmt, ...) {
        std::vector<char> str(128);
        va_list va;
        va_start(va, fmt);
        vsnprintf(str.data(), 128, fmt, va);
        va_end(va);
        auto it = tensors.find(str.data());
        if (it == tensors.end()) {
            throw std::runtime_error(string_format("weight tensor not found: %s", str.data()));
        }
        return it->second;
    }

    ~mimi_ggml_ctx() {
        ggml_free(ctx_data);
        gguf_free(ctx_gguf);
        ggml_backend_buffer_free(buf);
    }
};

///////////////////////////////////////////////////////////////////////////
// extension to ggml.h
// TODO: add these ops to the library (ofc with a more optimized kernel)

// mode: (0) constant, (1) reflect, (2) replicate, (3) circular
// value is only used in "constant"
// only "constant" with 0.0f and "replicate" are implemented here
static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode,
                                  int64_t pad_left, int64_t pad_right, float value = 0.0f) {
    GGML_ASSERT(value == 0.0f); // we can technically use ggml_arange, but for simplicity we only support 0.0f
    GGML_ASSERT(mode == 0 || mode == 2);
    if (pad_left > 0) {
        ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]);
        if (mode == 0) {
            tmp = ggml_scale(ctx0, tmp, value);
        } else if (mode == 2) {
            ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0); // get first column
            tmp = ggml_repeat(ctx0, elem, tmp);
        }
        x = ggml_concat(ctx0, tmp, x, 0);
    }
    if (pad_right > 0) {
        ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]);
        if (mode == 0) {
            tmp = ggml_scale(ctx0, tmp, value);
        } else if (mode == 2) {
            int64_t last = x->ne[0] - 1;
            ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x)); // get last column
            tmp = ggml_repeat(ctx0, elem, tmp);
        }
        x = ggml_concat(ctx0, x, tmp, 0);
    }
    return x;
}

///////////////////////////////////////////////////////////////////////////
// MimiConv and MimiConvTranspose

static int64_t div_ceil(int64_t a, int64_t b) {
    return a / b + (a % b ? 1 : 0);
}

static ggml_tensor * mimi_conv_1d(ggml_context * ctx0, ggml_tensor * x,
                                  ggml_tensor * kernel, ggml_tensor * bias,
                                  int stride, int dilation, bool pad_zero = true) {
    int64_t kernel_size = (kernel->ne[0] - 1) * dilation + 1;
    int64_t p_total = kernel_size - stride; // padding total
    int64_t p_half  = p_total / 2;
    int64_t n_frames  = div_ceil(x->ne[0] - kernel_size + p_total, stride);
    int64_t ideal_len = n_frames * stride + kernel_size - p_total;
    int64_t p_extra = ideal_len - x->ne[0];
    int64_t p_right = (mimi_config.causal ? 0 : p_half) + p_extra;
    int64_t p_left  = p_total - (mimi_config.causal ? 0 : p_half);
    x = ggml_pad_ext(ctx0, x, pad_zero ? 0 : 2, p_left, p_right);
    x = ggml_conv_1d(ctx0, kernel, x, stride, 0, dilation);
    if (bias) {
        x = ggml_add(ctx0, x, bias);
    }
    ggml_set_name(x, "mimi_conv_1d");
    return x;
}
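// Worked example for the padding arithmetic above (illustrative numbers only):
//   kernel->ne[0] = 7, stride = 1, dilation = 1, x->ne[0] = 100, causal = true
//   kernel_size = (7 - 1) * 1 + 1 = 7, p_total = 7 - 1 = 6
//   n_frames  = ceil((100 - 7 + 6) / 1) = 99
//   ideal_len = 99 * 1 + 7 - 6 = 100  =>  p_extra = 0
//   causal    => p_left = 6, p_right = 0, so all padding goes on the left and
//   the convolution output keeps the original length of 100.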
static ggml_tensor * mimi_conv_transpose_1d(ggml_context * ctx0, ggml_tensor * x,
                                            ggml_tensor * kernel, ggml_tensor * bias,
                                            int stride, int dilation, bool depthwise) {
    GGML_ASSERT(x->ne[1] == kernel->ne[2]);
    int64_t n_rows = x->ne[1];
    int64_t kernel_size = kernel->ne[0];
    int64_t p_total = kernel_size - stride; // padding total
    int64_t p_right = mimi_config.causal
        ? (float)p_total / mimi_config.trim_right_ratio
        : p_total / 2;
    int64_t p_left = p_total - p_right;
    ggml_tensor * out = nullptr;
    if (depthwise) {
        for (int64_t ir = 0; ir < n_rows; ir++) {
            ggml_tensor * row = ggml_view_1d(ctx0, x, x->ne[0], ir*x->ne[0]*ggml_element_size(x));
            ggml_tensor * krn = ggml_view_1d(ctx0, kernel, kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel));
            row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation);
            // unpad (remove p_right and p_left columns)
            row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row));
            // TODO: concat can be slow, we should use ggml_view_1d/ggml_cpy to avoid realloc
            out = out ? ggml_concat(ctx0, out, row, 1) : row;
        }
    } else {
        out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation);
        // unpad
        out = ggml_view_2d(ctx0, out, out->ne[0] - p_total, out->ne[1], out->nb[1], p_left*ggml_element_size(out));
    }
    if (bias) {
        out = ggml_add(ctx0, out, bias);
    }
    return out;
}
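// Worked example for the unpadding above (illustrative numbers only):
//   kernel->ne[0] = 16, stride = 8  =>  p_total = 16 - 8 = 8
//   causal with trim_right_ratio = 1.0  =>  p_right = 8, p_left = 0,
//   so the view keeps samples [0, out->ne[0] - 8) and all trimming happens at
//   the right edge of the upsampled signal.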
///////////////////////////////////////////////////////////////////////////

// based on MimiEncoder
// SEANet encoder as used by Mimi.
struct mimi_encoder_decoder {
    mimi_ggml_ctx & ctx;

    struct layer {
        bool is_elu = false;
        bool is_resnet = false;
        bool is_transposed_conv = false;
        ggml_tensor * conv_0_w = nullptr;
        ggml_tensor * conv_0_b = nullptr;
        ggml_tensor * conv_1_w = nullptr;
        ggml_tensor * conv_1_b = nullptr;
        int stride = 1;
    };
    std::vector<layer> layers;
    std::array<int, 4> repeated_pattern = {1, 4, 7, 10};

    mimi_encoder_decoder(mimi_ggml_ctx & ctx) : ctx(ctx) {
        layers.push_back({
            .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"),
            .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"),
        });
        for (int i = 0; i < (int)repeated_pattern.size(); ++i) {
            int i_start = repeated_pattern[i];
            // upsampling layers
            layers.push_back({
                .is_elu = true, // layer (i_start)
            });
            layers.push_back({
                .is_transposed_conv = true,
                .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1),
                .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias",   i_start + 1),
                .stride = mimi_config.upsampling_ratio[i],
            });
            // residual layers
            layers.push_back({
                .is_resnet = true,
                .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2),
                .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias",   i_start + 2),
                .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2),
                .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias",   i_start + 2),
            });
        }
        layers.push_back({
            .is_elu = true, // layer 13
        });
        layers.push_back({
            .conv_0_w = ctx.get_weight("decoder.layers.14.conv.weight"),
            .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"),
        });
    }

    ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input) {
        ggml_tensor * x = input;
        for (auto & layer : layers) {
            if (layer.is_elu) {
                x = ggml_elu(ctx0, x);
            } else if (layer.is_resnet) {
                ggml_tensor * residual = x;
                x = ggml_elu(ctx0, x);
                x = mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1);
                x = ggml_elu(ctx0, x);
                x = mimi_conv_1d(ctx0, x, layer.conv_1_w, layer.conv_1_b, 1, 1);
                x = ggml_add(ctx0, x, residual);
            } else {
                x = layer.is_transposed_conv
                    ? mimi_conv_transpose_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1, false)
                    : mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1);
            }
        }
        return x;
    }
};
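// The constructor above builds the decoder stack in weight-name order:
//   layer 0      : initial mimi_conv_1d
//   layers 1..12 : 4 repetitions of { ELU, transposed conv (strides 8, 6, 5, 4), resnet block },
//                  starting at indices 1, 4, 7, 10 (= repeated_pattern)
//   layer 13     : ELU
//   layer 14     : final mimi_conv_1d producing the waveform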
struct mimi_transformer {
    struct layer {
        ggml_tensor * inp_norm_w = nullptr;
        ggml_tensor * inp_norm_b = nullptr;
        ggml_tensor * attn_q = nullptr;
        ggml_tensor * attn_k = nullptr;
        ggml_tensor * attn_v = nullptr;
        ggml_tensor * attn_o = nullptr;
        ggml_tensor * attn_post_norm_w = nullptr;
        ggml_tensor * attn_post_norm_b = nullptr;
        ggml_tensor * attn_layer_scale = nullptr;
        ggml_tensor * ffn_up = nullptr;
        ggml_tensor * ffn_down = nullptr;
        ggml_tensor * mlp_layer_scale = nullptr;
    };
    std::vector<layer> layers;

    mimi_transformer(mimi_ggml_ctx & ctx, const char * prefix, int n_layers) {
        for (int il = 0; il < n_layers; il++) {
            layers.push_back({
                .inp_norm_w       = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il),
                .inp_norm_b       = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias",   prefix, il),
                .attn_q           = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il),
                .attn_k           = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il),
                .attn_v           = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il),
                .attn_o           = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il),
                .attn_post_norm_w = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il),
                .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias",   prefix, il),
                .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il),
                .ffn_up           = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il),
                .ffn_down         = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il),
                .mlp_layer_scale  = ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il),
            });
        }
    }

    ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * inp_pos) {
        int n_tokens = input->ne[1];
        ggml_tensor * x = input;

        auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
            x = ggml_norm(ctx0, x, mimi_config.norm_eps);
            x = ggml_mul(ctx0, x, w);
            x = ggml_add(ctx0, x, b);
            return x;
        };

        ggml_tensor * residual = input;
        for (auto & layer : layers) {
            residual = x;

            // input layer norm
            x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b);

            // self attention
            {
                ggml_tensor * q = ggml_mul_mat(ctx0, layer.attn_q, x);
                ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x);
                ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x);

                int n_embd_head = mimi_config.n_embd / mimi_config.n_head;
                q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head,    n_tokens);
                k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens);
                v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens);

                int n_rot = n_embd_head;
                q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0);
                q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3));

                k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0);
                k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3));

                ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
                ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp
                kq = ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head));
                ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens);
                kq = ggml_soft_max_inplace(ctx0, kq_masked);

                v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3));
                ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
                kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head);
                kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
                kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens);

                x = ggml_mul_mat(ctx0, layer.attn_o, kqv);
            }

            // residual
            x = ggml_mul(ctx0, x, layer.attn_layer_scale);
            x = ggml_add(ctx0, x, residual);

            residual = x;
            x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b);

            // mlp
            {
                x = ggml_mul_mat(ctx0, layer.ffn_up, x);
                x = ggml_gelu(ctx0, x);
                x = ggml_mul_mat(ctx0, layer.ffn_down, x);
            }

            // residual
            x = ggml_mul(ctx0, x, layer.mlp_layer_scale);
            x = ggml_add(ctx0, x, residual);
        }

        return x;
    }
};
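// Note on the attention dimensions used above: with the config values,
// n_embd_head = n_embd / n_head = 512 / 8 = 64, and n_rot is set to the same
// value, so RoPE is applied across the full head dimension (matching
// mimi_config.n_rot = 64).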
struct mimi_residual_vector_quantizer {
    struct component {
        ggml_tensor * codebook;
    };

    ggml_tensor * semantic_inp_proj;
    std::vector<component> semantic_components;
    ggml_tensor * semantic_out_proj;

    ggml_tensor * acoustic_inp_proj;
    std::vector<component> acoustic_components;
    ggml_tensor * acoustic_out_proj;

    mimi_residual_vector_quantizer(mimi_ggml_ctx & ctx) {
        semantic_inp_proj = ctx.get_weight("quantizer.semantic_rvq.input_proj.weight");
        semantic_out_proj = ctx.get_weight("quantizer.semantic_rvq.output_proj.weight");
        for (int i = 0; i < mimi_config.n_semantic_components; i++) {
            semantic_components.push_back({
                .codebook = ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook", i),
            });
        }
        acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight");
        acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight");
        for (int i = 0; i < mimi_config.n_acoustic_components; i++) {
            acoustic_components.push_back({
                .codebook = ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook", i),
            });
        }
    }

    // the input has shape [n_codes, n_codes_per_embd]
    // the first row is semantic, the rest are acoustic
    // example: [ [semantic], [acoustic1], [acoustic2], ... ]
    ggml_tensor * decode(ggml_context * ctx0, ggml_tensor * input) {
        GGML_ASSERT(input->type == GGML_TYPE_I32);
        size_t n_semantic = semantic_components.size();
        int64_t n_codes_per_embd = (n_semantic + acoustic_components.size());
        int64_t n_codes = input->ne[0] / n_codes_per_embd;
        GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0);

        ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes);
        ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes);
        out_s = ggml_scale(ctx0, out_s, 0.0f); // clear
        out_a = ggml_scale(ctx0, out_a, 0.0f); // clear

        for (size_t ir = 0; ir < (size_t)n_codes_per_embd; ir++) {
            ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input));
            if (ir < n_semantic) {
                // semantic
                ggml_tensor * codebook = semantic_components[ir].codebook;
                ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row);
                out_s = ggml_add(ctx0, out_s, embd);
            } else {
                // acoustic
                ggml_tensor * codebook = acoustic_components[ir - n_semantic].codebook;
                ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row);
                out_a = ggml_add(ctx0, out_a, embd);
            }
        }

        out_s = ggml_mul_mat(ctx0, semantic_out_proj, out_s);
        out_a = ggml_mul_mat(ctx0, acoustic_out_proj, out_a);

        return ggml_add(ctx0, out_s, out_a);
    }
};

mimi_model::mimi_model(const char * fname, bool verbose) : verbose(verbose) {
    ctx.reset(new mimi_ggml_ctx());
    ctx->load_gguf(fname);

    // initialize components
    seanet_dec     .reset(new mimi_encoder_decoder(*ctx));
    transformer_dec.reset(new mimi_transformer(*ctx, "decoder", mimi_config.num_hidden_layers));
    quantizer      .reset(new mimi_residual_vector_quantizer(*ctx));
}

mimi_model::~mimi_model() {
}

std::vector<float> mimi_model::decode_frame(const std::vector<int32_t> & codes, int & n_past) {
    // build cgraph
    int n_pos = -1;
    int n_codes = codes.size();
    int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components;
    GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd");

    ctx->build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) {
        ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes);
        ggml_set_name(inp_dec, "inp_dec");
        ggml_set_input(inp_dec);

        // RVQ
        ggml_tensor * embeddings = quantizer->decode(ctx_gf, inp_dec);

        // upsample
        embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
        embeddings = mimi_conv_transpose_1d(ctx_gf, embeddings,
            ctx->get_weight("upsample.conv.weight"), nullptr, 2, 1, true);

        // transformer
        n_pos = embeddings->ne[0];
        ggml_tensor * pos_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_pos);
        ggml_set_name(pos_dec, "pos_dec");
        ggml_set_input(pos_dec);
        embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
        embeddings = transformer_dec->forward(ctx_gf, embeddings, pos_dec);

        // SEANet decoder
        embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
        ggml_tensor * output = seanet_dec->forward(ctx_gf, embeddings);

        ggml_set_name(output, "output");
        ggml_set_output(output);
        ggml_build_forward_expand(gf, output);
    });

    // position data
    GGML_ASSERT(n_pos <= mimi_config.sliding_window);
    std::vector<int32_t> pos_data(n_pos);
    for (int i = 0; i < (int)pos_data.size(); i++) {
        pos_data[i] = i + n_past;
    }
    if (verbose) {
        printf("%s: n_pos: %d, n_past: %d\n", __func__, n_pos, n_past);
    }
    n_past += n_pos;
    ctx->set_tensor_data("pos_dec", pos_data.data());

    // code data
    auto codes_T = mimi_model::transpose_input(codes);
    ctx->set_tensor_data("inp_dec", codes_T.data());

    ctx->compute();

    auto output = ctx->get_tensor_data("output");
    // auto output_tensor = output.first;
    auto output_data = output.second;
    // printf("Output shape: [%lld, %lld]\n", output_tensor->ne[0], output_tensor->ne[1]);

    std::vector<float> wav_data(output_data.size() / sizeof(float));
    for (size_t i = 0; i < wav_data.size(); i++) {
        wav_data[i] = ((float *)output_data.data())[i];
    }

    return wav_data;
}

std::vector<float> mimi_model::decode(const std::vector<int32_t> & codes) {
    std::vector<float> output;
    if (verbose) {
        printf("%s: n_codes: %zu\n", __func__, codes.size());
    }
    int64_t t_start = ggml_time_ms();
    int n_frames = 0;
    int n_past = 0;
    for (size_t i = 0; i < codes.size(); i += mimi_config.n_codes_per_frame) {
        size_t remaining = std::min((size_t)mimi_config.n_codes_per_frame, codes.size() - i);
        std::vector<int32_t> frame(codes.begin() + i, codes.begin() + i + remaining);
        auto wav_data = decode_frame(frame, n_past);
        output.insert(output.end(), wav_data.begin(), wav_data.end());
        n_frames++;
    }
    int64_t t_end = ggml_time_ms();
    if (verbose) {
        printf("%s: n_frames: %d, time: %" PRId64 "ms, per_frame: %" PRId64 "ms\n",
            __func__, n_frames, t_end - t_start, (t_end - t_start) / n_frames);
    }
    return output;
}
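// Illustrative sketch of what transpose_input() below does, with 2 frames and 4
// codebooks instead of 32 (fNcM = code from codebook M of frame N):
//   input  (frame-major):    [ f0c0, f0c1, f0c2, f0c3,   f1c0, f1c1, f1c2, f1c3 ]
//   output (codebook-major): [ f0c0, f1c0,   f0c1, f1c1,   f0c2, f1c2,   f0c3, f1c3 ]
// i.e. row j of the output gathers code j of every frame, which is the layout
// consumed row by row in mimi_residual_vector_quantizer::decode().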
std::vector<int32_t> mimi_model::transpose_input(const std::vector<int32_t> & codes) {
    int n_codes = codes.size();
    int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components;
    GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd");
    std::vector<int32_t> codes_T(n_codes);
    for (int i = 0; i < n_codes / n_codes_per_embd; i++) {
        for (int j = 0; j < n_codes_per_embd; j++) {
            int src_idx = i * n_codes_per_embd + j;
            int dst_idx = j * (n_codes / n_codes_per_embd) + i;
            codes_T[dst_idx] = codes[src_idx];
        }
    }
    return codes_T;
}

int mimi_model::get_sample_rate() const {
    return mimi_config.sample_rate;
}