# examples/tts/convert_mimi_to_gguf.py
# Convert a Hugging Face Mimi model (e.g. kyutai/mimi) to GGUF with metadata.
import gguf
import argparse
import logging
import torch
from typing import Union
from pathlib import Path
from torch import Tensor
from transformers import MimiModel, PreTrainedModel
logger = logging.getLogger("mimi")
class MimiModelConverter:
mimi_model: PreTrainedModel
gguf_writer: gguf.GGUFWriter
fname_out: Path
ftype: gguf.LlamaFileType
def __init__(self,
pretrained_model_name_or_path: Union[Path, str],
fname_out: Path,
ftype: gguf.LlamaFileType,
is_big_endian: bool,):
# --- Load Model ---
self.mimi_model = MimiModel.from_pretrained(pretrained_model_name_or_path)
self.config = self.mimi_model.config # Store config for easier access
logger.info(f"Loaded model config: {self.config}")
self.fname_out = fname_out
self.ftype = ftype
endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
# --- Initialize GGUF Writer ---
self.gguf_writer = gguf.GGUFWriter(
path=None, # Path set during write
arch="mimi", # Set arch to 'mimi' instead of warning message
endianess=endianess)
# --- Add Metadata ---
logger.info("Adding metadata keys...")
# General Mimi parameters (adjust key names if the C++ side expects different ones).
# The architecture key itself is handled by the GGUFWriter constructor (arch="mimi" above).
self.gguf_writer.add_uint32("mimi.sample_rate", self.config.sampling_rate)
self.gguf_writer.add_uint32("mimi.hidden_size", self.config.hidden_size) # Assuming a general hidden size if available
self.gguf_writer.add_uint32("mimi.num_hidden_layers", self.config.num_hidden_layers) # The one confirmed missing
self.gguf_writer.add_uint32("mimi.intermediate_size", self.config.intermediate_size)
# Encoder specific (assuming these exist in config)
if hasattr(self.config, 'encoder_hidden_size'):
self.gguf_writer.add_uint32("mimi.encoder.hidden_size", self.config.encoder_hidden_size)
# Add other encoder params if needed, e.g., embedding dim, num layers if different
# Decoder specific (assuming these exist in config)
if hasattr(self.config, 'decoder_hidden_size'):
self.gguf_writer.add_uint32("mimi.decoder.hidden_size", self.config.decoder_hidden_size)
# Add other decoder params if needed
# RVQ specific (check exact names in config.json or C++ code)
# Using common names found in similar models, adjust if needed.
if hasattr(self.config, 'num_codebooks'):
self.gguf_writer.add_uint32("mimi.rvq.num_quantizers", self.config.num_codebooks)
if hasattr(self.config, 'codebook_dim'):
self.gguf_writer.add_uint32("mimi.rvq.codebook_dim", self.config.codebook_dim)
if hasattr(self.config, 'codebook_size'):
self.gguf_writer.add_uint32("mimi.rvq.codebook_size", self.config.codebook_size) # Might be needed by C++
logger.info("Finished adding metadata keys.")
assert self.config.architectures[0] == "MimiModel"
# --- Load and Add Tensors ---
logger.info("Processing and adding tensors...")
for name, data_torch in self.mimi_model.state_dict().items():
# convert any unsupported data types to float32
old_dtype = data_torch.dtype
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
self.add_tensor(name, data_torch, old_dtype)
logger.info("Finished processing tensors.")
def add_tensor(self, name: str, data_torch: Tensor, old_dtype: torch.dtype):
is_1d = len(data_torch.shape) == 1
is_bias = ".bias" in name
can_quantize = not is_1d and not is_bias
data_qtype = gguf.GGMLQuantizationType.F32
n_head = self.mimi_model.config.num_attention_heads
n_kv_head = self.mimi_model.config.num_key_value_heads
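# q/k projection weights in the Hugging Face checkpoint carry the rotary-embedding
# head interleaving; undo_permute (defined below) restores the row order expected
# on the ggml side.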
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = self.undo_permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = self.undo_permute(data_torch, n_head, n_kv_head)
# process codebook
if ".codebook.initialized" in name:
# "initialized" tensor
state_dict = self.mimi_model.state_dict()
embed_sum = state_dict[name.replace(".initialized", ".embed_sum")]
cluster_usage = state_dict[name.replace(".initialized", ".cluster_usage")]
# Recover the embeddings as embed_sum / cluster_usage (the EMA running mean),
# mirroring MimiEuclideanCodebook in modeling_mimi.py.
data_torch = embed_sum / cluster_usage.clamp(min=self.mimi_model.config.norm_eps)[:, None]
name = name.replace(".initialized", "")
# ignore processed tensors
if ".cluster_usage" in name or ".embed_sum" in name:
return
# transpose some tensors
if ".conv.bias" in name:
data_torch = data_torch.view((1, data_torch.shape[0]))
data_torch = data_torch.transpose(0, 1)
# change view 3d to 2d
if "quantizer" in name and "_proj." in name:
assert data_torch.shape[2] == 1
data_torch = data_torch.view((data_torch.shape[0], data_torch.shape[1]))
# shorten name, otherwise it will be too long for ggml to read
name = name.replace("_residual_vector_quantizer", "_rvq")
if can_quantize:
if self.ftype == gguf.LlamaFileType.ALL_F32:
data_qtype = gguf.GGMLQuantizationType.F32
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
data_qtype = gguf.GGMLQuantizationType.F16
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
data_qtype = gguf.GGMLQuantizationType.BF16
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
data_qtype = gguf.GGMLQuantizationType.Q8_0
else:
raise ValueError(f"Unsupported file type: {self.ftype}")
# Conv kernels are always F16
if ".conv.weight" in name:
data_qtype = gguf.GGMLQuantizationType.F16
data = data_torch.numpy()
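# gguf.quants.quantize can reject shapes whose last dimension is not a multiple
# of the quantization block size (32 for Q8_0); such tensors fall back to F16.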
try:
data = gguf.quants.quantize(data, data_qtype)
except Exception as e:
logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
data_qtype = gguf.GGMLQuantizationType.F16
data = gguf.quants.quantize(data, data_qtype)
# reverse shape to make it similar to the internal ggml dimension order
shape_str = f"{{\'{', '.join(str(n) for n in reversed(data_torch.shape))}\'}}"
# Reduce verbosity slightly by default, uncomment if needed for deep debug
# logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
def write(self):
self.gguf_writer.write_header_to_file(path=self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True)
self.gguf_writer.close()
logger.info(f"Model successfully converted and saved to {self.fname_out}") # Added confirmation message
@staticmethod
def undo_permute(weights: Tensor, n_head: int, n_head_kv: int):
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def main() -> None:
parser = argparse.ArgumentParser(
description="Convert Mimi safetensors model to GGUF with metadata",) # Updated description
parser.add_argument(
"--outfile", type=Path, default="kyutai-mimi.gguf",
help="path to write to",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
help="output format",
)
parser.add_argument(
"--bigendian", action="store_true",
help="model is executed on big endian machine",
)
parser.add_argument(
"model", type=str,
help="directory or model ID containing model file (if model ID is specified, download from Hugging Face hub)",
nargs="?",
default="kyutai/mimi",
)
parser.add_argument(
"--verbose", action="store_true",
help="increase output verbosity",
)
args = parser.parse_args()
if args.model is None:
parser.error("the following arguments are required: model")
else:
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
dir_model = args.model
fname_out = args.outfile
ftype_map: dict[str, gguf.LlamaFileType] = {
"f32": gguf.LlamaFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
}
logger.info(f"Loading model: {dir_model}")
with torch.inference_mode():
converter = MimiModelConverter(
pretrained_model_name_or_path=dir_model,
fname_out=fname_out,
ftype=ftype_map[args.outtype],
is_big_endian=args.bigendian,
)
converter.write()
if __name__ == '__main__':
main()
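# Example usage (illustrative; local paths and output names are placeholders):
#   python convert_mimi_to_gguf.py                      # converts kyutai/mimi to kyutai-mimi.gguf (f16)
#   python convert_mimi_to_gguf.py ./my-mimi-checkpoint --outfile mimi-q8_0.gguf --outtype q8_0 --verbose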