"""Load Meta-Llama-3.1-8B-Instruct in 4-bit and build tool-calling chat inputs.

Sets up a movie-recommendation assistant: quantizes the model with
bitsandbytes NF4, loads tokenizer + model, and renders a chat prompt
(including the TMDB tool schemas from the local ``tools`` module) via the
tokenizer's chat template.
"""

import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,  # NOTE(review): unused in this chunk — presumably used later for streaming generation
)

from tools import tools  # project-local: TMDB tool/function schemas for tool calling

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Specify how to quantize the model: 4-bit NF4 weights with bfloat16 compute,
# so the 8B model fits in modest GPU memory.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # shard/place layers automatically across available devices
    quantization_config=quantization_config,
)

# Conversation so far: system prompt constrains output to movie IDs + summaries.
messages = [
    {"role": "system", "content": "You are a movie search assistant bot who uses TMDB to help users find movies. You should respond with movie IDs and natural language text summaries when asked for movie recommendations. You should only provide the movie ID and the summary, nothing else."},
    {"role": "user", "content": "Can you recommend a good action movie?"},
]

# Render the chat (with tool schemas) through the model's chat template,
# appending the generation prompt so the model continues as the assistant.
inputs = tokenizer.apply_chat_template(
    messages,
    tools=tools,
    add_generation_prompt=True,
)