"""Load Meta-Llama-3.1-8B-Instruct in 4-bit and build tool-calling chat inputs.

Sets up a movie-recommendation assistant: quantizes the model with
bitsandbytes NF4, loads tokenizer + model, and renders a chat prompt
(including the TMDB tool schemas from the local ``tools`` module) via the
tokenizer's chat template.
"""

import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,  # NOTE(review): unused in this chunk — presumably used later for streaming generation
)

from tools import tools  # project-local: TMDB tool/function schemas for tool calling

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Specify how to quantize the model: 4-bit NF4 weights with bfloat16 compute,
# so the 8B model fits in modest GPU memory.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # shard/place layers automatically across available devices
    quantization_config=quantization_config,
)

# Conversation so far: system prompt constrains output to movie IDs + summaries.
messages = [
    {"role": "system", "content": "You are a movie search assistant bot who uses TMDB to help users find movies. You should respond with movie IDs and natural language text summaries when asked for movie recommendations. You should only provide the movie ID and the summary, nothing else."},
    {"role": "user", "content": "Can you recommend a good action movie?"},
]

# Render the chat (with tool schemas) through the model's chat template,
# appending the generation prompt so the model continues as the assistant.
inputs = tokenizer.apply_chat_template(
    messages,
    tools=tools,
    add_generation_prompt=True,
)