Demo-Llama-Guard-3-1B

Running

File size: 1,737 Bytes

52b129d
e2fac8d
 
6293678
29e0785
6293678
52b129d
 
 
 
e2fac8d
 
 
 
 
 
 
52b129d
e2fac8d
 
 
 
52b129d
 
e2fac8d
 
 
 
 
83fe2ae
e2fac8d
 
 
 
 
 
 
 
 
 
 
83fe2ae
e2fac8d
 
 
 
 
 
 
 
29e0785
e2fac8d

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import gradio as gr
import spaces

# Access token read from the environment; abort at import time when it is
# missing or empty so the failure is explicit.
huggingface_token = os.environ.get('HUGGINGFACE_TOKEN')
if huggingface_token is None or huggingface_token == "":
    raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

# Hub repository of the checkpoint to load.
model_id = "meta-llama/Llama-Guard-3-8B-INT8"
dtype = torch.bfloat16

# Use CUDA when torch reports it as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Quantization settings with 8-bit loading enabled.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

def load_model():
    """Instantiate the tokenizer and the causal language model.

    Both are loaded from the repository named by the module-level
    ``model_id`` and authenticated with ``huggingface_token``.  The model is
    additionally given the module-level ``dtype``, ``device`` (as its
    ``device_map``) and ``quantization_config``.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    # ``token`` is the supported keyword for authenticated downloads; the
    # older ``use_auth_token`` spelling is deprecated in transformers.
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map=device,
        quantization_config=quantization_config,
        token=huggingface_token,
    )
    return tokenizer, model


# Load once at import time; the request handler below reuses these objects.
tokenizer, model = load_model()

@spaces.GPU
def moderate(user_input, assistant_response):
    """Generate a moderation verdict for one user/assistant exchange.

    The two texts are arranged as a two-turn chat, rendered through the
    tokenizer's chat template, and passed to the model for generation.

    Args:
        user_input: Text of the user turn.
        assistant_response: Text of the assistant turn.

    Returns:
        str: The decoded newly generated tokens, with special tokens removed.
    """
    conversation = [
        {"role": "user", "content": user_input},
        {"role": "assistant", "content": assistant_response},
    ]

    prompt_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    prompt_ids = prompt_ids.to(device)

    generated = model.generate(
        input_ids=prompt_ids,
        max_new_tokens=100,
        pad_token_id=0,
    )

    # Skip the first prompt-length tokens so only the continuation is decoded.
    continuation = generated[0][prompt_ids.shape[-1]:]
    return tokenizer.decode(continuation, skip_special_tokens=True)

# Web form: two multi-line text inputs feed `moderate`, whose return value is
# shown in a single output text box.
iface = gr.Interface(
    title="Llama Guard Moderation",
    description="Enter a user input and an assistant response to check for content moderation.",
    fn=moderate,
    inputs=[
        gr.Textbox(label="User Input", lines=3),
        gr.Textbox(label="Assistant Response", lines=3),
    ],
    outputs=gr.Textbox(label="Moderation Result"),
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()