|  | import sys | 
					
						
						|  | import time | 
					
						
						|  | import warnings | 
					
						
						|  | from pathlib import Path | 
					
						
						|  | from typing import Optional | 
					
						
						|  |  | 
					
						
						|  | import lightning as L | 
					
						
						|  | import torch | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | wd = Path(__file__).parent.parent.resolve() | 
					
						
						|  | sys.path.append(str(wd)) | 
					
						
						|  |  | 
					
						
						|  | from generate import generate | 
					
						
						|  | from lit_llama import Tokenizer | 
					
						
						|  | from lit_llama.adapter import LLaMA | 
					
						
						|  | from lit_llama.utils import lazy_load, llama_model_lookup, quantization | 
					
						
						|  | from lit_llama.adapter_v2 import add_adapter_v2_parameters_to_linear_layers | 
					
						
						|  | from scripts.prepare_alpaca import generate_prompt | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def main( | 
					
						
						|  | prompt: str = "What food do lamas eat?", | 
					
						
						|  | input: str = "", | 
					
						
						|  | adapter_path: Path = Path("out/adapter_v2/alpaca/lit-llama-adapter-finetuned.pth"), | 
					
						
						|  | pretrained_path: Path = Path("checkpoints/lit-llama/7B/lit-llama.pth"), | 
					
						
						|  | tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"), | 
					
						
						|  | quantize: Optional[str] = None, | 
					
						
						|  | max_new_tokens: int = 100, | 
					
						
						|  | top_k: int = 200, | 
					
						
						|  | temperature: float = 0.8, | 
					
						
						|  | ) -> None: | 
					
						
						|  | """Generates a response based on a given instruction and an optional input. | 
					
						
						|  | This script will only work with checkpoints from the instruction-tuned LLaMA-Adapter model. | 
					
						
						|  | See `finetune_adapter_v2.py`. | 
					
						
						|  |  | 
					
						
						|  | Args: | 
					
						
						|  | prompt: The prompt/instruction (Alpaca style). | 
					
						
						|  | adapter_path: Path to the checkpoint with trained adapter weights, which are the output of | 
					
						
						|  | `finetune_adapter_v2.py`. | 
					
						
						|  | input: Optional input (Alpaca style). | 
					
						
						|  | pretrained_path: The path to the checkpoint with pretrained LLaMA weights. | 
					
						
						|  | tokenizer_path: The tokenizer path to load. | 
					
						
						|  | quantize: Whether to quantize the model and using which method: | 
					
						
						|  | ``"llm.int8"``: LLM.int8() mode, | 
					
						
						|  | ``"gptq.int4"``: GPTQ 4-bit mode. | 
					
						
						|  | max_new_tokens: The number of generation steps to take. | 
					
						
						|  | top_k: The number of top most probable tokens to consider in the sampling process. | 
					
						
						|  | temperature: A value controlling the randomness of the sampling process. Higher values result in more random | 
					
						
						|  | samples. | 
					
						
						|  | """ | 
					
						
						|  | assert adapter_path.is_file() | 
					
						
						|  | assert pretrained_path.is_file() | 
					
						
						|  | assert tokenizer_path.is_file() | 
					
						
						|  |  | 
					
						
						|  | precision = "bf16-true" if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else "32-true" | 
					
						
						|  | fabric = L.Fabric(devices=1, precision=precision) | 
					
						
						|  |  | 
					
						
						|  | print("Loading model ...", file=sys.stderr) | 
					
						
						|  | t0 = time.time() | 
					
						
						|  | with lazy_load(pretrained_path) as pretrained_checkpoint, lazy_load(adapter_path) as adapter_checkpoint: | 
					
						
						|  | name = llama_model_lookup(pretrained_checkpoint) | 
					
						
						|  |  | 
					
						
						|  | with fabric.init_module(empty_init=True), quantization(mode=quantize): | 
					
						
						|  | model = LLaMA.from_name(name) | 
					
						
						|  | add_adapter_v2_parameters_to_linear_layers(model) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | model.load_state_dict(pretrained_checkpoint, strict=False) | 
					
						
						|  |  | 
					
						
						|  | model.load_state_dict(adapter_checkpoint, strict=False) | 
					
						
						|  |  | 
					
						
						|  | print(f"Time to load model: {time.time() - t0:.02f} seconds.", file=sys.stderr) | 
					
						
						|  |  | 
					
						
						|  | model.eval() | 
					
						
						|  | model = fabric.setup(model) | 
					
						
						|  |  | 
					
						
						|  | tokenizer = Tokenizer(tokenizer_path) | 
					
						
						|  | sample = {"instruction": prompt, "input": input} | 
					
						
						|  | prompt = generate_prompt(sample) | 
					
						
						|  | encoded = tokenizer.encode(prompt, bos=True, eos=False, device=model.device) | 
					
						
						|  | prompt_length = encoded.size(0) | 
					
						
						|  |  | 
					
						
						|  | t0 = time.perf_counter() | 
					
						
						|  | y = generate(model, encoded, max_new_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) | 
					
						
						|  | t = time.perf_counter() - t0 | 
					
						
						|  |  | 
					
						
						|  | model.reset_cache() | 
					
						
						|  | output = tokenizer.decode(y) | 
					
						
						|  | output = output.split("### Response:")[1].strip() | 
					
						
						|  | print(output) | 
					
						
						|  |  | 
					
						
						|  | tokens_generated = y.size(0) - prompt_length | 
					
						
						|  | print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr) | 
					
						
						|  | if fabric.device.type == "cuda": | 
					
						
						|  | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if __name__ == "__main__": | 
					
						
						|  | from jsonargparse import CLI | 
					
						
						|  |  | 
					
						
						|  | torch.set_float32_matmul_precision("high") | 
					
						
						|  | warnings.filterwarnings( | 
					
						
						|  |  | 
					
						
						|  | "ignore", | 
					
						
						|  | message="ComplexHalf support is experimental and many operators don't support it yet" | 
					
						
						|  | ) | 
					
						
						|  | CLI(main) | 
					
						
						|  |  |