import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# Specify the model name and safetensors checkpoint path
MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"  # adjust to the checkpoint you are loading
SAFETENSORS_PATH = "path_to_your_model.safetensors"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Initialize an empty (meta-device) model from the config so no weights are
# materialized in RAM yet
config = AutoConfig.from_pretrained(MODEL_NAME)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Use Hugging Face's `accelerate` to load the checkpoint efficiently.
# `load_checkpoint_and_dispatch` streams the safetensors weights directly into
# the model and shards/offloads them across GPU, CPU, and disk as needed, so
# there is no need to load the file separately with `safetensors.torch.load_file`.
model = load_checkpoint_and_dispatch(
    model,
    SAFETENSORS_PATH,
    device_map="auto",  # automatically places layers on GPU/CPU
    no_split_module_classes=["MixtralDecoderLayer"],  # keep each decoder block on one device
    dtype=torch.float16,  # half precision for memory efficiency
)

# The model is already dispatched across devices, so do not call `model.to(device)`;
# only the inputs need to be placed on the first device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example usage
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Generate output with efficient memory usage
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
        num_return_sequences=1,
        do_sample=True,  # required for temperature/top_k/top_p to take effect
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )

# Decode and print the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", generated_text)
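
# Alternative (a minimal sketch, assuming the checkpoint is a standard Hub repo
# or a local directory containing the config and safetensors shards):
# `from_pretrained` with `device_map="auto"` uses accelerate internally, so the
# manual init_empty_weights / load_checkpoint_and_dispatch steps above can be
# skipped when you are not loading from a single standalone safetensors file.
#
#   model = AutoModelForCausalLM.from_pretrained(
#       MODEL_NAME,
#       device_map="auto",          # shard/offload across available devices
#       torch_dtype=torch.float16,  # half precision for memory efficiency
#   )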