Please help me with GGUF coding!
Hello, I'm developing my own Streamlit app to train language models from scratch. It works well, but when I export the model as GGUF, load it in text-generation-webui, and send a message, it fails to tokenize and prints the error llama.cpp error: 'invalid unordered_map<K, T> key'
in the console. Can anyone with GGUF experience help me? I'm getting desperate.
export_backend.py
import torch
import json
import numpy as np
from typing import Dict, Any, List, Optional, Union
import struct
from pathlib import Path
import zipfile
from gguf_writer import GGUFWriter, GGMLType, GGUFValueType
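# Decide which parameters are excluded from the GGUF export.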
def should_skip_tensor(name: str) -> bool:
    skip_patterns = [
        'embedding_projection.projection.weight',
        'embedding_projection.projection.bias',
    ]
    return any(pattern in name for pattern in skip_patterns)
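# Map PyTorch/Hugging Face parameter names onto the llama.cpp GGUF naming scheme
# (token_embd, blk.N.*, output_norm, output).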
def convert_parameter_name_to_gguf(pytorch_name: str) -> str:
name_mapping = {
'embed_tokens.weight': 'token_embd.weight',
'norm.weight': 'output_norm.weight',
'lm_head.weight': 'output.weight',
}
if 'layers.' in pytorch_name:
parts = pytorch_name.split('.')
if len(parts) < 3:
return pytorch_name
layer_num = parts[1]
rest = '.'.join(parts[2:])
layer_mappings = {
'input_layernorm.weight': f'blk.{layer_num}.attn_norm.weight',
'post_attention_layernorm.weight': f'blk.{layer_num}.ffn_norm.weight',
'self_attn.q_proj.weight': f'blk.{layer_num}.attn_q.weight',
'self_attn.k_proj.weight': f'blk.{layer_num}.attn_k.weight',
'self_attn.v_proj.weight': f'blk.{layer_num}.attn_v.weight',
'self_attn.o_proj.weight': f'blk.{layer_num}.attn_output.weight',
'mlp.gate_proj.weight': f'blk.{layer_num}.ffn_gate.weight',
'mlp.up_proj.weight': f'blk.{layer_num}.ffn_up.weight',
'mlp.down_proj.weight': f'blk.{layer_num}.ffn_down.weight',
}
if rest in layer_mappings:
return layer_mappings[rest]
return name_mapping.get(pytorch_name, pytorch_name)
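# Decide whether a weight matrix should be transposed before being written to GGUF.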
def should_transpose_tensor(gguf_name: str, tensor_shape: tuple) -> bool:
transpose_patterns = {
'token_embd.weight': True,
'output.weight': True,
}
if gguf_name in transpose_patterns:
return transpose_patterns[gguf_name]
if any(pattern in gguf_name for pattern in ['.attn_q.weight', '.attn_k.weight', '.attn_v.weight',
'.attn_output.weight', '.ffn_gate.weight',
'.ffn_up.weight', '.ffn_down.weight']):
if len(tensor_shape) == 2:
return True
return False
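# Fallback tensor used when no usable lm_head / output.weight can be exported.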
def create_dummy_output_tensor(vocab_size: int, hidden_size: int) -> torch.Tensor:
print(f"Creating dummy output.weight tensor with shape [{vocab_size}, {hidden_size}]")
dummy_tensor = torch.randn(vocab_size, hidden_size, dtype=torch.float32) * 0.02
return dummy_tensor
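# Print an overview of which tensors will be exported, which will be skipped, and their shapes.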
def debug_model_tensors(model):
print("=== MODEL TENSOR DEBUG ===")
tensors_to_process = []
tensors_to_skip = []
for i, (name, param) in enumerate(model.named_parameters()):
if should_skip_tensor(name):
tensors_to_skip.append((i+1, name, param.shape, param.numel()))
else:
gguf_name = convert_parameter_name_to_gguf(name)
tensors_to_process.append((i+1, name, gguf_name, param.shape, param.numel()))
print("TENSORS TO PROCESS:")
for i, name, gguf_name, shape, numel in tensors_to_process:
print(f"{i:2d}. {name}")
print(f" -> {gguf_name}")
print(f" Shape: {shape}, Elements: {numel}")
if numel == 0:
print(f" *** EMPTY TENSOR ***")
print()
if tensors_to_skip:
print("TENSORS TO SKIP:")
for i, name, shape, numel in tensors_to_skip:
print(f"{i:2d}. {name} (SKIPPED)")
print(f" Shape: {shape}, Elements: {numel}")
print()
print(f"Total tensors in model: {len(list(model.named_parameters()))}")
print(f"Tensors to process: {len(tensors_to_process)}")
print(f"Tensors to skip: {len(tensors_to_skip)}")
print("=========================")
def convert_pytorch_to_gguf(model, tokenizer, output_path: str, quantization: str = "f32", progress_callback=None) -> bool:
try:
debug_model_tensors(model)
writer = GGUFWriter(output_path, "llama")
required_attrs = ['vocab_size', 'hidden_size', 'num_hidden_layers', 'num_attention_heads']
for attr in required_attrs:
if not hasattr(model, attr):
raise ValueError(f"Model missing required attribute: {attr}")
def get_safe_attr(obj, attr, default):
return getattr(obj, attr, default)
vocab_size = get_safe_attr(model, 'vocab_size', 32000)
hidden_size = get_safe_attr(model, 'hidden_size', 4096)
num_layers = get_safe_attr(model, 'num_hidden_layers', 32)
num_heads = get_safe_attr(model, 'num_attention_heads', 32)
num_kv_heads = get_safe_attr(model, 'num_key_value_heads', num_heads)
intermediate_size = get_safe_attr(model, 'intermediate_size', 11008)
max_position_embeddings = get_safe_attr(model, 'max_position_embeddings', 2048)
rms_norm_eps = get_safe_attr(model, 'rms_norm_eps', 1e-6)
rope_theta = get_safe_attr(model, 'rope_theta', 10000.0)
writer.add_metadata("general.architecture", "llama", GGUFValueType.STRING)
writer.add_metadata("general.name", "custom-llama-model", GGUFValueType.STRING)
writer.add_metadata("general.version", "1.0", GGUFValueType.STRING)
writer.add_metadata("general.description", "Custom trained GGUF model", GGUFValueType.STRING)
writer.add_metadata("llama.vocab_size", vocab_size, GGUFValueType.UINT32)
writer.add_metadata("llama.context_length", max_position_embeddings, GGUFValueType.UINT32)
writer.add_metadata("llama.embedding_length", hidden_size, GGUFValueType.UINT32)
writer.add_metadata("llama.block_count", num_layers, GGUFValueType.UINT32)
writer.add_metadata("llama.feed_forward_length", intermediate_size, GGUFValueType.UINT32)
writer.add_metadata("llama.attention.head_count", num_heads, GGUFValueType.UINT32)
writer.add_metadata("llama.attention.head_count_kv", num_kv_heads, GGUFValueType.UINT32)
writer.add_metadata("llama.attention.layer_norm_rms_epsilon", float(rms_norm_eps), GGUFValueType.FLOAT32)
writer.add_metadata("llama.rope.freq_base", float(rope_theta), GGUFValueType.FLOAT32)
# FIXED TOKENIZER EXPORT - Preserve token order and IDs
try:
# Get vocabulary with proper ordering
if hasattr(tokenizer, 'get_vocab'):
vocab = tokenizer.get_vocab()
else:
vocab = tokenizer.vocab
# Create ordered list of tokens by their IDs
vocab_size_actual = len(vocab)
tokens = [''] * vocab_size_actual
# Fill tokens array in correct order by ID
for token, token_id in vocab.items():
if 0 <= token_id < vocab_size_actual:
tokens[token_id] = token
else:
print(f"Warning: Token '{token}' has ID {token_id} outside vocab range [0, {vocab_size_actual})")
# Handle any missing tokens (fill with placeholder)
for i in range(vocab_size_actual):
if tokens[i] == '':
tokens[i] = f'<UNK_{i}>'
print(f"Warning: Missing token at ID {i}, using placeholder")
print(f"Tokenizer vocab size: {vocab_size_actual}")
print(f"First 10 tokens: {tokens[:10]}")
print(f"Last 10 tokens: {tokens[-10:]}")
# Verify special tokens
bos_id = getattr(tokenizer, 'bos_token_id', None)
eos_id = getattr(tokenizer, 'eos_token_id', None)
pad_id = getattr(tokenizer, 'pad_token_id', None)
unk_id = getattr(tokenizer, 'unk_token_id', None)
if bos_id is not None and 0 <= bos_id < len(tokens):
print(f"BOS token ID {bos_id}: '{tokens[bos_id]}'")
if eos_id is not None and 0 <= eos_id < len(tokens):
print(f"EOS token ID {eos_id}: '{tokens[eos_id]}'")
if pad_id is not None and 0 <= pad_id < len(tokens):
print(f"PAD token ID {pad_id}: '{tokens[pad_id]}'")
if unk_id is not None and 0 <= unk_id < len(tokens):
print(f"UNK token ID {unk_id}: '{tokens[unk_id]}'")
# Add tokenizer metadata with properly ordered tokens
writer.add_metadata("tokenizer.ggml.model", "llama", GGUFValueType.STRING)
writer.add_metadata("tokenizer.ggml.tokens", tokens, GGUFValueType.ARRAY)
# Create scores array (all zeros for now)
token_scores = [0.0] * len(tokens)
writer.add_metadata("tokenizer.ggml.scores", token_scores, GGUFValueType.ARRAY)
# Add token types (all normal tokens for now)
token_types = [1] * len(tokens) # 1 = normal token, 2 = unknown, 3 = control, etc.
writer.add_metadata("tokenizer.ggml.token_type", token_types, GGUFValueType.ARRAY)
# Add special token IDs with proper fallbacks
bos_id = bos_id if bos_id is not None else 1
eos_id = eos_id if eos_id is not None else 2
pad_id = pad_id if pad_id is not None else eos_id
unk_id = unk_id if unk_id is not None else 0
writer.add_metadata("tokenizer.ggml.bos_token_id", int(bos_id), GGUFValueType.UINT32)
writer.add_metadata("tokenizer.ggml.eos_token_id", int(eos_id), GGUFValueType.UINT32)
writer.add_metadata("tokenizer.ggml.pad_token_id", int(pad_id), GGUFValueType.UINT32)
writer.add_metadata("tokenizer.ggml.unk_token_id", int(unk_id), GGUFValueType.UINT32)
# Add additional special tokens if they exist
special_tokens = {}
if hasattr(tokenizer, 'added_tokens_encoder'):
for token, token_id in tokenizer.added_tokens_encoder.items():
special_tokens[token] = token_id
print(f"Special token '{token}': ID {token_id}")
print(f"Successfully exported tokenizer with {len(tokens)} tokens")
except Exception as e:
print(f"Warning: Error adding tokenizer metadata: {e}")
print("Falling back to basic tokenizer metadata")
writer.add_metadata("tokenizer.ggml.model", "llama", GGUFValueType.STRING)
# Create minimal tokenizer data
basic_tokens = [f"<token_{i}>" for i in range(vocab_size)]
writer.add_metadata("tokenizer.ggml.tokens", basic_tokens, GGUFValueType.ARRAY)
basic_scores = [0.0] * vocab_size
writer.add_metadata("tokenizer.ggml.scores", basic_scores, GGUFValueType.ARRAY)
tensor_type_map = {
"f16": GGMLType.F16,
"q8_0": GGMLType.Q8_0,
"q4_0": GGMLType.Q4_0,
"f32": GGMLType.F32
}
tensor_type = tensor_type_map.get(quantization, GGMLType.F32)
valid_tensors = [(name, param) for name, param in model.named_parameters() if not should_skip_tensor(name)]
total_tensors = len(valid_tensors)
print(f"Processing {total_tensors} tensors (after filtering)...")
tensors_processed = 0
output_tensor_found = False
for i, (name, param) in enumerate(valid_tensors):
try:
gguf_name = convert_parameter_name_to_gguf(name)
print(f"Processing tensor {i+1}/{total_tensors}: {name} -> {gguf_name}")
print(f" Shape: {param.shape}, Elements: {param.numel()}")
tensor_data = param.detach().cpu()
if gguf_name == 'output.weight':
output_tensor_found = True
try:
if should_transpose_tensor(gguf_name, tensor_data.shape):
print(f" Transposing output tensor: {tensor_data.shape} -> {tensor_data.T.shape}")
tensor_data = tensor_data.T
writer.add_tensor(gguf_name, tensor_data, tensor_type)
print(f" Successfully processed output.weight tensor")
except Exception as e:
print(f" Error processing output.weight from lm_head.weight: {e}")
print(f" Creating dummy output.weight tensor")
dummy_tensor = create_dummy_output_tensor(vocab_size, hidden_size)
if should_transpose_tensor(gguf_name, dummy_tensor.shape):
dummy_tensor = dummy_tensor.T
writer.add_tensor(gguf_name, dummy_tensor, tensor_type)
print(f" Successfully added dummy output.weight tensor")
else:
if should_transpose_tensor(gguf_name, tensor_data.shape):
print(f" Transposing tensor: {tensor_data.shape} -> {tensor_data.T.shape}")
tensor_data = tensor_data.T
writer.add_tensor(gguf_name, tensor_data, tensor_type)
tensors_processed += 1
if progress_callback:
progress_callback((i + 1) / total_tensors)
except Exception as e:
print(f"Error processing parameter {name}: {e}")
if gguf_name == 'output.weight':
print(f"Creating dummy output.weight tensor as fallback")
try:
dummy_tensor = create_dummy_output_tensor(vocab_size, hidden_size)
if should_transpose_tensor(gguf_name, dummy_tensor.shape):
dummy_tensor = dummy_tensor.T
writer.add_tensor(gguf_name, dummy_tensor, tensor_type)
output_tensor_found = True
tensors_processed += 1
print(f"Successfully added dummy output.weight tensor")
except Exception as dummy_e:
print(f"Failed to create dummy output.weight tensor: {dummy_e}")
raise e
else:
raise e
if not output_tensor_found:
print("output.weight tensor not found in model, creating dummy tensor")
try:
dummy_tensor = create_dummy_output_tensor(vocab_size, hidden_size)
if should_transpose_tensor('output.weight', dummy_tensor.shape):
dummy_tensor = dummy_tensor.T
writer.add_tensor('output.weight', dummy_tensor, tensor_type)
tensors_processed += 1
print("Successfully added dummy output.weight tensor")
except Exception as e:
print(f"Failed to create dummy output.weight tensor: {e}")
raise Exception("Critical: Could not create output.weight tensor")
print(f"Successfully processed {tensors_processed} tensors")
print("Writing GGUF file...")
writer.write()
print("GGUF file written successfully!")
return True
except Exception as e:
raise Exception(f"Error converting to GGUF: {e}")
def get_model_info(model) -> Dict[str, Any]:
return {
'total_parameters': sum(p.numel() for p in model.parameters()),
'vocab_size': getattr(model, 'vocab_size', 'unknown'),
'hidden_size': getattr(model, 'hidden_size', 'unknown'),
'num_layers': getattr(model, 'num_hidden_layers', 'unknown'),
'num_attention_heads': getattr(model, 'num_attention_heads', 'unknown'),
'num_key_value_heads': getattr(model, 'num_key_value_heads', 'unknown'),
'intermediate_size': getattr(model, 'intermediate_size', 'unknown'),
'max_position_embeddings': getattr(model, 'max_position_embeddings', 'unknown')
}
def create_config_file(model, quantization: str, training_stats: Optional[Dict] = None) -> Dict[str, Any]:
config_content = {
'model_config': getattr(model, 'config', {}),
'quantization': quantization,
'training_stats': training_stats
}
return config_content
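# Build the downloadable package: the GGUF file plus an optional JSON config.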
def create_model_package(model, tokenizer, model_name: str, quantization: str,
include_config: bool, temp_dir: Path,
training_stats: Optional[Dict] = None,
progress_callback=None) -> List[Path]:
files_created = []
gguf_path = temp_dir / f"{model_name}.gguf"
success = convert_pytorch_to_gguf(
model, tokenizer, str(gguf_path), quantization, progress_callback
)
if not success:
raise Exception("Failed to convert model to GGUF format")
files_created.append(gguf_path)
if include_config:
config_path = temp_dir / f"{model_name}_config.json"
config_content = create_config_file(model, quantization, training_stats)
config_path.write_text(json.dumps(config_content, indent=2, default=str), encoding='utf-8')
files_created.append(config_path)
return files_created
def create_zip_package(files: List[Path], output_path: Path) -> None:
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
for file_path in files:
zipf.write(file_path, file_path.name)
def get_quantization_info() -> Dict[str, str]:
return {
"f32": "Full precision (largest file, best quality)",
"f16": "Half precision (good balance)",
"q8_0": "8-bit quantization (smaller, slight quality loss)",
"q4_0": "4-bit quantization (smallest, more quality loss)"
}
def test_model_generation(model, tokenizer, generate_function):
test_prompt = "<|begin_of_text|><|start_header_id|>start<|end_header_id|>\nHello Monika\n<|start_header_id|>end<|end_header_id|>\n\n<|start_header_id|>reply<|end_header_id|>\n"
generated_text = generate_function(
model, tokenizer, test_prompt, max_length=20
)
return test_prompt, generated_text
gguf_writer.py
import torch
import numpy as np
from typing import Dict, Any, List, Optional, Union
import struct
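# GGUF v3 file constants: the magic is the ASCII string "GGUF" (stored little-endian),
# and tensor data is padded to 32-byte alignment by default.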
GGUF_MAGIC = 0x46554747
GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32
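# ggml tensor data type IDs as used by llama.cpp.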
class GGMLType:
F32 = 0
F16 = 1
Q4_0 = 2
Q4_1 = 3
Q5_0 = 6
Q5_1 = 7
Q8_0 = 8
Q8_1 = 9
Q2_K = 10
Q3_K = 11
Q4_K = 12
Q5_K = 13
Q6_K = 14
Q8_K = 15
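# GGUF metadata (key/value) type IDs from the GGUF spec.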
class GGUFValueType:
UINT8 = 0
INT8 = 1
UINT16 = 2
INT16 = 3
UINT32 = 4
INT32 = 5
FLOAT32 = 6
BOOL = 7
STRING = 8
ARRAY = 9
UINT64 = 10
INT64 = 11
FLOAT64 = 12
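# Minimal hand-rolled GGUF v3 writer: buffers metadata and tensors in memory, then
# write() serializes the header, key/value metadata, tensor infos, and aligned tensor data.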
class GGUFWriter:
def __init__(self, path: str, arch: str):
self.path = path
self.arch = arch
self.metadata = {}
self.tensors = []
self.tensor_data = {}
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
def add_metadata(self, key: str, value: Any, value_type: int):
if not isinstance(key, str):
raise ValueError(f"Metadata key must be string, got {type(key)}")
self.metadata[key] = (value, value_type)
def add_tensor(self, name: str, tensor: Union[torch.Tensor, np.ndarray], tensor_type: int = GGMLType.F32):
if not isinstance(name, str):
raise ValueError(f"Tensor name must be string, got {type(name)}")
if isinstance(tensor, torch.Tensor):
tensor_np = tensor.detach().cpu().numpy()
else:
tensor_np = np.array(tensor)
if tensor_np.size == 0:
print(f"Warning: Tensor {name} is empty, creating minimal tensor")
tensor_np = np.array([0.0], dtype=np.float32)
        if tensor_type == GGMLType.F32:
            tensor_np = tensor_np.astype(np.float32)
        elif tensor_type == GGMLType.F16:
            tensor_np = tensor_np.astype(np.float16)
        else:
            # Quantized types are not implemented by this writer; fall back to F32
            # so the declared tensor type matches the bytes actually written.
            tensor_np = tensor_np.astype(np.float32)
            tensor_type = GGMLType.F32
tensor_np = np.ascontiguousarray(tensor_np)
if len(tensor_np.shape) == 0:
tensor_np = tensor_np.reshape(1)
self.tensors.append({
'name': name,
'shape': list(tensor_np.shape),
'type': tensor_type,
'offset': 0
})
self.tensor_data[name] = tensor_np
def _align_to(self, value: int, alignment: int) -> int:
remainder = value % alignment
if remainder == 0:
return value
return value + alignment - remainder
def _write_string(self, f, s: str):
if not isinstance(s, str):
s = str(s)
encoded = s.encode('utf-8')
f.write(struct.pack('<Q', len(encoded)))
f.write(encoded)
def _write_array_value(self, f, value: Any, elem_type: int):
if elem_type == GGUFValueType.STRING:
if not isinstance(value, str):
value = str(value)
self._write_string(f, value)
elif elem_type == GGUFValueType.UINT32:
f.write(struct.pack('<I', int(value)))
elif elem_type == GGUFValueType.INT32:
f.write(struct.pack('<i', int(value)))
elif elem_type == GGUFValueType.FLOAT32:
f.write(struct.pack('<f', float(value)))
elif elem_type == GGUFValueType.UINT64:
f.write(struct.pack('<Q', int(value)))
elif elem_type == GGUFValueType.INT64:
f.write(struct.pack('<q', int(value)))
elif elem_type == GGUFValueType.FLOAT64:
f.write(struct.pack('<d', float(value)))
elif elem_type == GGUFValueType.BOOL:
f.write(struct.pack('<B', 1 if value else 0))
else:
raise ValueError(f"Unsupported array element type: {elem_type}")
def _write_metadata_value(self, f, value: Any, value_type: int):
f.write(struct.pack('<I', value_type))
if value_type == GGUFValueType.STRING:
self._write_string(f, str(value))
elif value_type == GGUFValueType.UINT32:
f.write(struct.pack('<I', int(value)))
elif value_type == GGUFValueType.INT32:
f.write(struct.pack('<i', int(value)))
elif value_type == GGUFValueType.UINT64:
f.write(struct.pack('<Q', int(value)))
elif value_type == GGUFValueType.INT64:
f.write(struct.pack('<q', int(value)))
elif value_type == GGUFValueType.FLOAT32:
f.write(struct.pack('<f', float(value)))
elif value_type == GGUFValueType.FLOAT64:
f.write(struct.pack('<d', float(value)))
elif value_type == GGUFValueType.BOOL:
f.write(struct.pack('<B', 1 if value else 0))
elif value_type == GGUFValueType.ARRAY:
if not isinstance(value, (list, tuple)):
raise ValueError(f"Array value must be list or tuple, got {type(value)}")
if len(value) == 0:
f.write(struct.pack('<I', GGUFValueType.STRING))
f.write(struct.pack('<Q', 0))
else:
first_elem = value[0]
if isinstance(first_elem, str):
elem_type = GGUFValueType.STRING
elif isinstance(first_elem, bool):
elem_type = GGUFValueType.BOOL
elif isinstance(first_elem, float):
elem_type = GGUFValueType.FLOAT32
elif isinstance(first_elem, int):
if all(-2147483648 <= x <= 2147483647 for x in value if isinstance(x, int)):
elem_type = GGUFValueType.INT32
else:
elem_type = GGUFValueType.INT64
else:
elem_type = GGUFValueType.STRING
f.write(struct.pack('<I', elem_type))
f.write(struct.pack('<Q', len(value)))
for item in value:
self._write_array_value(f, item, elem_type)
else:
raise ValueError(f"Unsupported metadata value type: {value_type}")
def write(self):
try:
with open(self.path, 'wb') as f:
f.write(struct.pack('<I', GGUF_MAGIC))
f.write(struct.pack('<I', GGUF_VERSION))
f.write(struct.pack('<Q', len(self.tensors)))
f.write(struct.pack('<Q', len(self.metadata)))
for key, (value, value_type) in self.metadata.items():
self._write_string(f, key)
self._write_metadata_value(f, value, value_type)
current_offset = 0
for tensor_info in self.tensors:
tensor_name = tensor_info['name']
tensor_data = self.tensor_data[tensor_name]
tensor_size = tensor_data.nbytes
tensor_info['offset'] = current_offset
current_offset += tensor_size
current_offset = self._align_to(current_offset, self.data_alignment)
for tensor_info in self.tensors:
self._write_string(f, tensor_info['name'])
f.write(struct.pack('<I', len(tensor_info['shape'])))
for dim in tensor_info['shape']:
f.write(struct.pack('<Q', int(dim)))
f.write(struct.pack('<I', tensor_info['type']))
f.write(struct.pack('<Q', tensor_info['offset']))
current_pos = f.tell()
aligned_pos = self._align_to(current_pos, self.data_alignment)
if aligned_pos > current_pos:
f.write(b'\x00' * (aligned_pos - current_pos))
                for tensor_info in self.tensors:
                    tensor_name = tensor_info['name']
                    tensor_data = self.tensor_data[tensor_name]
                    f.write(tensor_data.tobytes())
                    # Pad every tensor to the data alignment so the bytes land at
                    # the offsets computed above.
                    current_pos = f.tell()
                    aligned_pos = self._align_to(current_pos, self.data_alignment)
                    if aligned_pos > current_pos:
                        f.write(b'\x00' * (aligned_pos - current_pos))
except Exception as e:
raise Exception(f"Error writing GGUF file: {e}")
Hello, apologies for the really late reply.
From experience, updating the gguf library usually fixes the errors I've run into, but I'm not sure it will in this case. You could also try re-exporting, in case something went wrong during the process (although the issue could be with the lib itself).
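In case it helps, below is very roughly what the export could look like if you let llama.cpp's own gguf-py package do the writing instead of a hand-rolled writer; it handles the header, offsets, alignment and tokenizer fields for you. The helpers should_skip_tensor / convert_parameter_name_to_gguf are the ones from your export_backend.py, and the model/tokenizer attributes are placeholders taken from your code, so treat this as an untested sketch rather than a drop-in replacement:

# Rough sketch only: uses llama.cpp's gguf-py writer (pip install gguf).
# should_skip_tensor / convert_parameter_name_to_gguf are your own helpers
# from export_backend.py; the model/tokenizer attributes are assumptions.
from gguf import GGUFWriter

def export_with_gguf_py(model, tokenizer, path: str) -> None:
    writer = GGUFWriter(path, arch="llama")

    # Hyperparameters (same values your code already reads from the model).
    writer.add_context_length(model.max_position_embeddings)
    writer.add_embedding_length(model.hidden_size)
    writer.add_block_count(model.num_hidden_layers)
    writer.add_feed_forward_length(model.intermediate_size)
    writer.add_head_count(model.num_attention_heads)
    writer.add_head_count_kv(getattr(model, 'num_key_value_heads', model.num_attention_heads))
    writer.add_layer_norm_rms_eps(float(model.rms_norm_eps))
    writer.add_rope_freq_base(float(getattr(model, 'rope_theta', 10000.0)))

    # Tokenizer: tokens ordered by ID, with scores/types of the same length.
    vocab = tokenizer.get_vocab()
    tokens = [tok for tok, _ in sorted(vocab.items(), key=lambda kv: kv[1])]
    writer.add_tokenizer_model("llama")
    writer.add_token_list(tokens)
    writer.add_token_scores([0.0] * len(tokens))
    writer.add_token_types([1] * len(tokens))  # 1 = normal token
    if getattr(tokenizer, 'bos_token_id', None) is not None:
        writer.add_bos_token_id(tokenizer.bos_token_id)
    if getattr(tokenizer, 'eos_token_id', None) is not None:
        writer.add_eos_token_id(tokenizer.eos_token_id)

    # Weights: the library takes care of alignment and offsets.
    for name, param in model.named_parameters():
        if should_skip_tensor(name):
            continue
        writer.add_tensor(convert_parameter_name_to_gguf(name),
                          param.detach().cpu().float().numpy())

    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()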
To be honest, it might be best to ask on the text-generation-webui or llama.cpp GitHub if you haven't already.
Again, sorry for the really late reply. Hopefully you were able to solve it or find a way around it by now.
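If you do come back to it, a quick way to see what actually ended up in the exported file is to dump its metadata with the same gguf package (the path below is a placeholder). llama.cpp builds its vocab from the tokenizer.ggml.* arrays, and if your tokenizer is BPE-based I believe it also expects tokenizer.ggml.merges, so it's worth checking that those keys are all present:

# Quick sanity check of the exported file, assuming the gguf package that
# ships with llama.cpp is installed (pip install gguf). Path is a placeholder.
from gguf import GGUFReader

reader = GGUFReader("your_model.gguf")

# Every metadata key that was written; the tokenizer.ggml.* arrays are what
# llama.cpp reads when it builds the vocab.
for key in reader.fields:
    print(key)

# Tensor names, shapes and types as llama.cpp will see them.
for tensor in reader.tensors:
    print(tensor.name, tensor.shape, tensor.tensor_type)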