Please help me with GGUF coding!

#1
by SmilleCreeper - opened

Hello, I'm developing my own Streamlit app to train language models from scratch. It works well, but when I export the model as GGUF, load it in text-generation-webui, and send a message, it fails to tokenize and the console shows the error llama.cpp error: 'invalid unordered_map<K, T> key'. Can anyone with GGUF experience help me? I'm getting desperate.
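
For reference, a quick way to inspect what actually ended up in the exported file is something like this (just a sketch using the reader from the gguf Python package; attribute names may differ between versions):

import sys
from gguf import GGUFReader  # pip install gguf

# Dump the metadata keys and tensor listing of an exported .gguf so they can be
# compared against what the writer was supposed to emit.
reader = GGUFReader(sys.argv[1] if len(sys.argv) > 1 else "model.gguf")

print("--- metadata ---")
for key, field in reader.fields.items():
    print(key, field.types)

print("--- tensors ---")
for tensor in reader.tensors:
    print(tensor.name, list(tensor.shape), tensor.tensor_type)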

export_backend.py

import torch
import json
import numpy as np
from typing import Dict, Any, List, Optional, Union
import struct
from pathlib import Path
import zipfile
from gguf_writer import GGUFWriter, GGMLType, GGUFValueType

def should_skip_tensor(name: str) -> bool:
    skip_patterns = [
        'embedding_projection.projection.weight',
        'embedding_projection.projection.bias',
    ]
    
    # Substring matching also covers exact matches.
    return any(pattern in name for pattern in skip_patterns)

def convert_parameter_name_to_gguf(pytorch_name: str) -> str:
    name_mapping = {
        'embed_tokens.weight': 'token_embd.weight',
        'norm.weight': 'output_norm.weight',
        'lm_head.weight': 'output.weight',
    }
    
    if 'layers.' in pytorch_name:
        parts = pytorch_name.split('.')
        if len(parts) < 3:
            return pytorch_name
        
        layer_num = parts[1]
        rest = '.'.join(parts[2:])
        
        layer_mappings = {
            'input_layernorm.weight': f'blk.{layer_num}.attn_norm.weight',
            'post_attention_layernorm.weight': f'blk.{layer_num}.ffn_norm.weight',
            'self_attn.q_proj.weight': f'blk.{layer_num}.attn_q.weight',
            'self_attn.k_proj.weight': f'blk.{layer_num}.attn_k.weight',
            'self_attn.v_proj.weight': f'blk.{layer_num}.attn_v.weight',
            'self_attn.o_proj.weight': f'blk.{layer_num}.attn_output.weight',
            'mlp.gate_proj.weight': f'blk.{layer_num}.ffn_gate.weight',
            'mlp.up_proj.weight': f'blk.{layer_num}.ffn_up.weight',
            'mlp.down_proj.weight': f'blk.{layer_num}.ffn_down.weight',
        }
        
        if rest in layer_mappings:
            return layer_mappings[rest]
    
    return name_mapping.get(pytorch_name, pytorch_name)

def should_transpose_tensor(gguf_name: str, tensor_shape: tuple) -> bool:
    transpose_patterns = {
        'token_embd.weight': True,
        'output.weight': True,
    }
    
    if gguf_name in transpose_patterns:
        return transpose_patterns[gguf_name]
    
    if any(pattern in gguf_name for pattern in ['.attn_q.weight', '.attn_k.weight', '.attn_v.weight', 
                                                '.attn_output.weight', '.ffn_gate.weight', 
                                                '.ffn_up.weight', '.ffn_down.weight']):
        if len(tensor_shape) == 2:
            return True
    
    return False

def create_dummy_output_tensor(vocab_size: int, hidden_size: int) -> torch.Tensor:
    print(f"Creating dummy output.weight tensor with shape [{vocab_size}, {hidden_size}]")
    dummy_tensor = torch.randn(vocab_size, hidden_size, dtype=torch.float32) * 0.02
    return dummy_tensor

def debug_model_tensors(model):
    print("=== MODEL TENSOR DEBUG ===")
    tensors_to_process = []
    tensors_to_skip = []
    
    for i, (name, param) in enumerate(model.named_parameters()):
        if should_skip_tensor(name):
            tensors_to_skip.append((i+1, name, param.shape, param.numel()))
        else:
            gguf_name = convert_parameter_name_to_gguf(name)
            tensors_to_process.append((i+1, name, gguf_name, param.shape, param.numel()))
    
    print("TENSORS TO PROCESS:")
    for i, name, gguf_name, shape, numel in tensors_to_process:
        print(f"{i:2d}. {name}")
        print(f"    -> {gguf_name}")
        print(f"    Shape: {shape}, Elements: {numel}")
        if numel == 0:
            print(f"    *** EMPTY TENSOR ***")
        print()
    
    if tensors_to_skip:
        print("TENSORS TO SKIP:")
        for i, name, shape, numel in tensors_to_skip:
            print(f"{i:2d}. {name} (SKIPPED)")
            print(f"    Shape: {shape}, Elements: {numel}")
            print()
    
    print(f"Total tensors in model: {len(list(model.named_parameters()))}")
    print(f"Tensors to process: {len(tensors_to_process)}")
    print(f"Tensors to skip: {len(tensors_to_skip)}")
    print("=========================")

def convert_pytorch_to_gguf(model, tokenizer, output_path: str, quantization: str = "f32", progress_callback=None) -> bool:
    try:
        debug_model_tensors(model)
        
        writer = GGUFWriter(output_path, "llama")
        
        required_attrs = ['vocab_size', 'hidden_size', 'num_hidden_layers', 'num_attention_heads']
        for attr in required_attrs:
            if not hasattr(model, attr):
                raise ValueError(f"Model missing required attribute: {attr}")
        
        def get_safe_attr(obj, attr, default):
            return getattr(obj, attr, default)
        
        vocab_size = get_safe_attr(model, 'vocab_size', 32000)
        hidden_size = get_safe_attr(model, 'hidden_size', 4096)
        num_layers = get_safe_attr(model, 'num_hidden_layers', 32)
        num_heads = get_safe_attr(model, 'num_attention_heads', 32)
        num_kv_heads = get_safe_attr(model, 'num_key_value_heads', num_heads)
        intermediate_size = get_safe_attr(model, 'intermediate_size', 11008)
        max_position_embeddings = get_safe_attr(model, 'max_position_embeddings', 2048)
        rms_norm_eps = get_safe_attr(model, 'rms_norm_eps', 1e-6)
        rope_theta = get_safe_attr(model, 'rope_theta', 10000.0)
        
        writer.add_metadata("general.architecture", "llama", GGUFValueType.STRING)
        writer.add_metadata("general.name", "custom-llama-model", GGUFValueType.STRING)
        writer.add_metadata("general.version", "1.0", GGUFValueType.STRING)
        writer.add_metadata("general.description", "Custom trained GGUF model", GGUFValueType.STRING)
        
        writer.add_metadata("llama.vocab_size", vocab_size, GGUFValueType.UINT32)
        writer.add_metadata("llama.context_length", max_position_embeddings, GGUFValueType.UINT32)
        writer.add_metadata("llama.embedding_length", hidden_size, GGUFValueType.UINT32)
        writer.add_metadata("llama.block_count", num_layers, GGUFValueType.UINT32)
        writer.add_metadata("llama.feed_forward_length", intermediate_size, GGUFValueType.UINT32)
        writer.add_metadata("llama.attention.head_count", num_heads, GGUFValueType.UINT32)
        writer.add_metadata("llama.attention.head_count_kv", num_kv_heads, GGUFValueType.UINT32)
        writer.add_metadata("llama.attention.layer_norm_rms_epsilon", float(rms_norm_eps), GGUFValueType.FLOAT32)
        writer.add_metadata("llama.rope.freq_base", float(rope_theta), GGUFValueType.FLOAT32)
        
        # FIXED TOKENIZER EXPORT - Preserve token order and IDs
        try:
            # Get vocabulary with proper ordering
            if hasattr(tokenizer, 'get_vocab'):
                vocab = tokenizer.get_vocab()
            else:
                vocab = tokenizer.vocab
            
            # Create ordered list of tokens by their IDs
            vocab_size_actual = len(vocab)
            tokens = [''] * vocab_size_actual
            
            # Fill tokens array in correct order by ID
            for token, token_id in vocab.items():
                if 0 <= token_id < vocab_size_actual:
                    tokens[token_id] = token
                else:
                    print(f"Warning: Token '{token}' has ID {token_id} outside vocab range [0, {vocab_size_actual})")
            
            # Handle any missing tokens (fill with placeholder)
            for i in range(vocab_size_actual):
                if tokens[i] == '':
                    tokens[i] = f'<UNK_{i}>'
                    print(f"Warning: Missing token at ID {i}, using placeholder")
            
            print(f"Tokenizer vocab size: {vocab_size_actual}")
            print(f"First 10 tokens: {tokens[:10]}")
            print(f"Last 10 tokens: {tokens[-10:]}")
            
            # Verify special tokens
            bos_id = getattr(tokenizer, 'bos_token_id', None)
            eos_id = getattr(tokenizer, 'eos_token_id', None)
            pad_id = getattr(tokenizer, 'pad_token_id', None)
            unk_id = getattr(tokenizer, 'unk_token_id', None)
            
            if bos_id is not None and 0 <= bos_id < len(tokens):
                print(f"BOS token ID {bos_id}: '{tokens[bos_id]}'")
            if eos_id is not None and 0 <= eos_id < len(tokens):
                print(f"EOS token ID {eos_id}: '{tokens[eos_id]}'")
            if pad_id is not None and 0 <= pad_id < len(tokens):
                print(f"PAD token ID {pad_id}: '{tokens[pad_id]}'")
            if unk_id is not None and 0 <= unk_id < len(tokens):
                print(f"UNK token ID {unk_id}: '{tokens[unk_id]}'")
            
            # Add tokenizer metadata with properly ordered tokens
            writer.add_metadata("tokenizer.ggml.model", "llama", GGUFValueType.STRING)
            writer.add_metadata("tokenizer.ggml.tokens", tokens, GGUFValueType.ARRAY)
            
            # Create scores array (all zeros for now)
            token_scores = [0.0] * len(tokens)
            writer.add_metadata("tokenizer.ggml.scores", token_scores, GGUFValueType.ARRAY)
            
            # Add token types (all normal tokens for now)
            token_types = [1] * len(tokens)  # 1 = normal token, 2 = unknown, 3 = control, etc.
            writer.add_metadata("tokenizer.ggml.token_type", token_types, GGUFValueType.ARRAY)
            
            # Add special token IDs with proper fallbacks
            bos_id = bos_id if bos_id is not None else 1
            eos_id = eos_id if eos_id is not None else 2
            pad_id = pad_id if pad_id is not None else eos_id
            unk_id = unk_id if unk_id is not None else 0
            
            writer.add_metadata("tokenizer.ggml.bos_token_id", int(bos_id), GGUFValueType.UINT32)
            writer.add_metadata("tokenizer.ggml.eos_token_id", int(eos_id), GGUFValueType.UINT32)
            writer.add_metadata("tokenizer.ggml.pad_token_id", int(pad_id), GGUFValueType.UINT32)
            writer.add_metadata("tokenizer.ggml.unk_token_id", int(unk_id), GGUFValueType.UINT32)
            
            # Add additional special tokens if they exist
            special_tokens = {}
            if hasattr(tokenizer, 'added_tokens_encoder'):
                for token, token_id in tokenizer.added_tokens_encoder.items():
                    special_tokens[token] = token_id
                    print(f"Special token '{token}': ID {token_id}")
            
            print(f"Successfully exported tokenizer with {len(tokens)} tokens")
            
        except Exception as e:
            print(f"Warning: Error adding tokenizer metadata: {e}")
            print("Falling back to basic tokenizer metadata")
            writer.add_metadata("tokenizer.ggml.model", "llama", GGUFValueType.STRING)
            # Create minimal tokenizer data
            basic_tokens = [f"<token_{i}>" for i in range(vocab_size)]
            writer.add_metadata("tokenizer.ggml.tokens", basic_tokens, GGUFValueType.ARRAY)
            basic_scores = [0.0] * vocab_size
            writer.add_metadata("tokenizer.ggml.scores", basic_scores, GGUFValueType.ARRAY)
        
        tensor_type_map = {
            "f16": GGMLType.F16,
            "q8_0": GGMLType.Q8_0,
            "q4_0": GGMLType.Q4_0,
            "f32": GGMLType.F32
        }
        tensor_type = tensor_type_map.get(quantization, GGMLType.F32)
        
        valid_tensors = [(name, param) for name, param in model.named_parameters() if not should_skip_tensor(name)]
        total_tensors = len(valid_tensors)
        
        print(f"Processing {total_tensors} tensors (after filtering)...")
        
        tensors_processed = 0
        output_tensor_found = False
        
        for i, (name, param) in enumerate(valid_tensors):
            # Resolve the GGUF name before the try block so the except handler can reference it safely.
            gguf_name = convert_parameter_name_to_gguf(name)
            try:
                
                print(f"Processing tensor {i+1}/{total_tensors}: {name} -> {gguf_name}")
                print(f"  Shape: {param.shape}, Elements: {param.numel()}")
                
                tensor_data = param.detach().cpu()
                
                if gguf_name == 'output.weight':
                    output_tensor_found = True
                    try:
                        if should_transpose_tensor(gguf_name, tensor_data.shape):
                            print(f"  Transposing output tensor: {tensor_data.shape} -> {tensor_data.T.shape}")
                            tensor_data = tensor_data.T
                        
                        writer.add_tensor(gguf_name, tensor_data, tensor_type)
                        print(f"  Successfully processed output.weight tensor")
                    except Exception as e:
                        print(f"  Error processing output.weight from lm_head.weight: {e}")
                        print(f"  Creating dummy output.weight tensor")
                        dummy_tensor = create_dummy_output_tensor(vocab_size, hidden_size)
                        if should_transpose_tensor(gguf_name, dummy_tensor.shape):
                            dummy_tensor = dummy_tensor.T
                        writer.add_tensor(gguf_name, dummy_tensor, tensor_type)
                        print(f"  Successfully added dummy output.weight tensor")
                else:
                    if should_transpose_tensor(gguf_name, tensor_data.shape):
                        print(f"  Transposing tensor: {tensor_data.shape} -> {tensor_data.T.shape}")
                        tensor_data = tensor_data.T
                    
                    writer.add_tensor(gguf_name, tensor_data, tensor_type)
                
                tensors_processed += 1
                
                if progress_callback:
                    progress_callback((i + 1) / total_tensors)
                    
            except Exception as e:
                print(f"Error processing parameter {name}: {e}")
                if gguf_name == 'output.weight':
                    print(f"Creating dummy output.weight tensor as fallback")
                    try:
                        dummy_tensor = create_dummy_output_tensor(vocab_size, hidden_size)
                        if should_transpose_tensor(gguf_name, dummy_tensor.shape):
                            dummy_tensor = dummy_tensor.T
                        writer.add_tensor(gguf_name, dummy_tensor, tensor_type)
                        output_tensor_found = True
                        tensors_processed += 1
                        print(f"Successfully added dummy output.weight tensor")
                    except Exception as dummy_e:
                        print(f"Failed to create dummy output.weight tensor: {dummy_e}")
                        raise e
                else:
                    raise e
        
        if not output_tensor_found:
            print("output.weight tensor not found in model, creating dummy tensor")
            try:
                dummy_tensor = create_dummy_output_tensor(vocab_size, hidden_size)
                if should_transpose_tensor('output.weight', dummy_tensor.shape):
                    dummy_tensor = dummy_tensor.T
                writer.add_tensor('output.weight', dummy_tensor, tensor_type)
                tensors_processed += 1
                print("Successfully added dummy output.weight tensor")
            except Exception as e:
                print(f"Failed to create dummy output.weight tensor: {e}")
                raise Exception("Critical: Could not create output.weight tensor")
        
        print(f"Successfully processed {tensors_processed} tensors")
        
        print("Writing GGUF file...")
        writer.write()
        print("GGUF file written successfully!")
        return True
        
    except Exception as e:
        raise Exception(f"Error converting to GGUF: {e}")

def get_model_info(model) -> Dict[str, Any]:
    return {
        'total_parameters': sum(p.numel() for p in model.parameters()),
        'vocab_size': getattr(model, 'vocab_size', 'unknown'),
        'hidden_size': getattr(model, 'hidden_size', 'unknown'),
        'num_layers': getattr(model, 'num_hidden_layers', 'unknown'),
        'num_attention_heads': getattr(model, 'num_attention_heads', 'unknown'),
        'num_key_value_heads': getattr(model, 'num_key_value_heads', 'unknown'),
        'intermediate_size': getattr(model, 'intermediate_size', 'unknown'),
        'max_position_embeddings': getattr(model, 'max_position_embeddings', 'unknown')
    }

def create_config_file(model, quantization: str, training_stats: Optional[Dict] = None) -> Dict[str, Any]:
    config_content = {
        'model_config': getattr(model, 'config', {}),
        'quantization': quantization,
        'training_stats': training_stats
    }
    return config_content

def create_model_package(model, tokenizer, model_name: str, quantization: str, 
                        include_config: bool, temp_dir: Path, 
                        training_stats: Optional[Dict] = None,
                        progress_callback=None) -> List[Path]:
    files_created = []
    
    gguf_path = temp_dir / f"{model_name}.gguf"
    success = convert_pytorch_to_gguf(
        model, tokenizer, str(gguf_path), quantization, progress_callback
    )
    
    if not success:
        raise Exception("Failed to convert model to GGUF format")
    
    files_created.append(gguf_path)
    
    if include_config:
        config_path = temp_dir / f"{model_name}_config.json"
        config_content = create_config_file(model, quantization, training_stats)
        config_path.write_text(json.dumps(config_content, indent=2, default=str), encoding='utf-8')
        files_created.append(config_path)
    
    return files_created

def create_zip_package(files: List[Path], output_path: Path) -> None:
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for file_path in files:
            zipf.write(file_path, file_path.name)

def get_quantization_info() -> Dict[str, str]:
    return {
        "f32": "Full precision (largest file, best quality)",
        "f16": "Half precision (good balance)",
        "q8_0": "8-bit quantization (smaller, slight quality loss)",
        "q4_0": "4-bit quantization (smallest, more quality loss)"
    }

def test_model_generation(model, tokenizer, generate_function):
    test_prompt = "<|begin_of_text|><|start_header_id|>start<|end_header_id|>\nHello Monika\n<|start_header_id|>end<|end_header_id|>\n\n<|start_header_id|>reply<|end_header_id|>\n"
    generated_text = generate_function(
        model, tokenizer, test_prompt, max_length=20
    )
    return test_prompt, generated_text

gguf_writer.py

import torch
import numpy as np
from typing import Dict, Any, List, Optional, Union
import struct

GGUF_MAGIC = 0x46554747
GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32

class GGMLType:
    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15

class GGUFValueType:
    UINT8 = 0
    INT8 = 1
    UINT16 = 2
    INT16 = 3
    UINT32 = 4
    INT32 = 5
    FLOAT32 = 6
    BOOL = 7
    STRING = 8
    ARRAY = 9
    UINT64 = 10
    INT64 = 11
    FLOAT64 = 12

class GGUFWriter:
    def __init__(self, path: str, arch: str):
        self.path = path
        self.arch = arch
        self.metadata = {}
        self.tensors = []
        self.tensor_data = {}
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        
    def add_metadata(self, key: str, value: Any, value_type: int):
        if not isinstance(key, str):
            raise ValueError(f"Metadata key must be string, got {type(key)}")
        self.metadata[key] = (value, value_type)
    
    def add_tensor(self, name: str, tensor: Union[torch.Tensor, np.ndarray], tensor_type: int = GGMLType.F32):
        if not isinstance(name, str):
            raise ValueError(f"Tensor name must be string, got {type(name)}")
        
        if isinstance(tensor, torch.Tensor):
            tensor_np = tensor.detach().cpu().numpy()
        else:
            tensor_np = np.array(tensor)
        
        if tensor_np.size == 0:
            print(f"Warning: Tensor {name} is empty, creating minimal tensor")
            tensor_np = np.array([0.0], dtype=np.float32)
        
        if tensor_type == GGMLType.F32:
            tensor_np = tensor_np.astype(np.float32)
        elif tensor_type == GGMLType.F16:
            tensor_np = tensor_np.astype(np.float16)
        else:
            tensor_np = tensor_np.astype(np.float32)
        
        tensor_np = np.ascontiguousarray(tensor_np)
        
        if len(tensor_np.shape) == 0:
            tensor_np = tensor_np.reshape(1)
        
        self.tensors.append({
            'name': name,
            'shape': list(tensor_np.shape),
            'type': tensor_type,
            'offset': 0
        })
        self.tensor_data[name] = tensor_np
    
    def _align_to(self, value: int, alignment: int) -> int:
        remainder = value % alignment
        if remainder == 0:
            return value
        return value + alignment - remainder
    
    def _write_string(self, f, s: str):
        if not isinstance(s, str):
            s = str(s)
        encoded = s.encode('utf-8')
        f.write(struct.pack('<Q', len(encoded)))
        f.write(encoded)
    
    def _write_array_value(self, f, value: Any, elem_type: int):
        if elem_type == GGUFValueType.STRING:
            if not isinstance(value, str):
                value = str(value)
            self._write_string(f, value)
        elif elem_type == GGUFValueType.UINT32:
            f.write(struct.pack('<I', int(value)))
        elif elem_type == GGUFValueType.INT32:
            f.write(struct.pack('<i', int(value)))
        elif elem_type == GGUFValueType.FLOAT32:
            f.write(struct.pack('<f', float(value)))
        elif elem_type == GGUFValueType.UINT64:
            f.write(struct.pack('<Q', int(value)))
        elif elem_type == GGUFValueType.INT64:
            f.write(struct.pack('<q', int(value)))
        elif elem_type == GGUFValueType.FLOAT64:
            f.write(struct.pack('<d', float(value)))
        elif elem_type == GGUFValueType.BOOL:
            f.write(struct.pack('<B', 1 if value else 0))
        else:
            raise ValueError(f"Unsupported array element type: {elem_type}")
    
    def _write_metadata_value(self, f, value: Any, value_type: int):
        f.write(struct.pack('<I', value_type))
        
        if value_type == GGUFValueType.STRING:
            self._write_string(f, str(value))
        elif value_type == GGUFValueType.UINT32:
            f.write(struct.pack('<I', int(value)))
        elif value_type == GGUFValueType.INT32:
            f.write(struct.pack('<i', int(value)))
        elif value_type == GGUFValueType.UINT64:
            f.write(struct.pack('<Q', int(value)))
        elif value_type == GGUFValueType.INT64:
            f.write(struct.pack('<q', int(value)))
        elif value_type == GGUFValueType.FLOAT32:
            f.write(struct.pack('<f', float(value)))
        elif value_type == GGUFValueType.FLOAT64:
            f.write(struct.pack('<d', float(value)))
        elif value_type == GGUFValueType.BOOL:
            f.write(struct.pack('<B', 1 if value else 0))
        elif value_type == GGUFValueType.ARRAY:
            if not isinstance(value, (list, tuple)):
                raise ValueError(f"Array value must be list or tuple, got {type(value)}")
            
            if len(value) == 0:
                f.write(struct.pack('<I', GGUFValueType.STRING))
                f.write(struct.pack('<Q', 0))
            else:
                first_elem = value[0]
                if isinstance(first_elem, str):
                    elem_type = GGUFValueType.STRING
                elif isinstance(first_elem, bool):
                    elem_type = GGUFValueType.BOOL
                elif isinstance(first_elem, float):
                    elem_type = GGUFValueType.FLOAT32
                elif isinstance(first_elem, int):
                    if all(-2147483648 <= x <= 2147483647 for x in value if isinstance(x, int)):
                        elem_type = GGUFValueType.INT32
                    else:
                        elem_type = GGUFValueType.INT64
                else:
                    elem_type = GGUFValueType.STRING
                
                f.write(struct.pack('<I', elem_type))
                f.write(struct.pack('<Q', len(value)))
                
                for item in value:
                    self._write_array_value(f, item, elem_type)
        else:
            raise ValueError(f"Unsupported metadata value type: {value_type}")
    
    def write(self):
        try:
            with open(self.path, 'wb') as f:
                f.write(struct.pack('<I', GGUF_MAGIC))
                f.write(struct.pack('<I', GGUF_VERSION))
                
                f.write(struct.pack('<Q', len(self.tensors)))
                f.write(struct.pack('<Q', len(self.metadata)))
                
                for key, (value, value_type) in self.metadata.items():
                    self._write_string(f, key)
                    self._write_metadata_value(f, value, value_type)
                
                current_offset = 0
                for tensor_info in self.tensors:
                    tensor_name = tensor_info['name']
                    tensor_data = self.tensor_data[tensor_name]
                    tensor_size = tensor_data.nbytes
                    
                    tensor_info['offset'] = current_offset
                    current_offset += tensor_size
                    
                    current_offset = self._align_to(current_offset, self.data_alignment)
                
                for tensor_info in self.tensors:
                    self._write_string(f, tensor_info['name'])
                    
                    f.write(struct.pack('<I', len(tensor_info['shape'])))
                    
                    for dim in tensor_info['shape']:
                        f.write(struct.pack('<Q', int(dim)))
                    
                    f.write(struct.pack('<I', tensor_info['type']))
                    f.write(struct.pack('<Q', tensor_info['offset']))
                
                current_pos = f.tell()
                aligned_pos = self._align_to(current_pos, self.data_alignment)
                if aligned_pos > current_pos:
                    f.write(b'\x00' * (aligned_pos - current_pos))
                
                for tensor_info in self.tensors:
                    tensor_name = tensor_info['name']
                    tensor_data = self.tensor_data[tensor_name]
                    
                    f.write(tensor_data.tobytes())
                    
                    current_pos = f.tell()
                    aligned_pos = self._align_to(current_pos, self.data_alignment)
                    if aligned_pos > current_pos:
                        f.write(b'\x00' * (aligned_pos - current_pos))
                        
        except Exception as e:
            raise Exception(f"Error writing GGUF file: {e}")

#2

by 922 RSG - CA

Hello, apologies for the really late reply.

From experience, updating the gguf library usually fixes the errors I've run into, but I'm not sure about this case. You can also try re-exporting, since maybe something went wrong during the process (although the issue could be with the library itself).
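
If updating alone doesn't help, you could also try letting the official gguf package (llama.cpp's gguf-py) write the file instead of a hand-rolled writer. Very rough sketch of the call sequence below, with dummy sizes and a toy vocab just to show the API; double-check the method names against the version you install:

import numpy as np
from gguf import GGUFWriter  # pip install gguf

# Toy export just to exercise the official writer; sizes and the vocab are
# placeholders for illustration, not a loadable model.
hidden, layers, heads, ffn, vocab = 64, 2, 4, 256, 8

writer = GGUFWriter("toy.gguf", "llama")
writer.add_context_length(128)
writer.add_embedding_length(hidden)
writer.add_block_count(layers)
writer.add_feed_forward_length(ffn)
writer.add_head_count(heads)
writer.add_head_count_kv(heads)
writer.add_layer_norm_rms_eps(1e-6)
writer.add_rope_freq_base(10000.0)

writer.add_tokenizer_model("llama")
writer.add_token_list([f"<tok_{i}>" for i in range(vocab)])
writer.add_token_scores([0.0] * vocab)
writer.add_token_types([1] * vocab)  # 1 = normal token in llama.cpp's token type enum
writer.add_bos_token_id(1)
writer.add_eos_token_id(2)

writer.add_tensor("token_embd.weight", np.zeros((vocab, hidden), dtype=np.float32))

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()

If a file written this way loads fine in text-generation-webui, the problem is probably in the custom writer; if it still fails, the tokenizer data itself is the more likely culprit.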

To be honest, it might be best to ask on the text-generation-webui or llama.cpp GitHub if you haven't already.

Again, sorry for the really late reply. Hopefully you were able to solve it or find a way around it by now.
