Image-to-Image
vincie
VINCIE-3B / llm14b /tokenizer_demo.py
leigangqu's picture
Upload folder using huggingface_hub
5096732 verified
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch

# Demo: load the tokenizer stored in the current working directory, tokenize
# one long passage, and print its whitespace word count next to the number of
# positions the attention mask marks as attended.
path = os.getcwd()

# Note: The default behavior now has injection attack prevention off.
max_length = 77
tokenizer = AutoTokenizer.from_pretrained(
    path,
    model_max_length=max_length,
    use_fast=False,
    trust_remote_code=True,
    padding_side="right",
    truncation_side="right",
    add_eod_token=True,
)
# Register '<|endoftext|>' as the padding token so padding="max_length" works.
tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})

text = "Several years after the first victory over the Gauls, Pergamon was again attacked by the Gauls together with their ally Antiochus Hierax, the younger brother of Seleucus II Callinicus, and ruler of Seleucid Asia Minor from his capital at Sardis. Attalus defeated the Gauls and Antiochus at the Battle of Aphrodisium and again at a second battle in the east. Three subsequent battles were fought and won against Antiochus Hierax's forces, which fought without support from the Gauls: in Hellespontine Phrygia, where Antiochus was perhaps seeking refuge with his father-in law, Ziaelas the king of Bithynia; near Sardis in the spring of 228 BC; and, in the final conflict of the campaign, in Caria at the Battle of the Harpasus, the Harpasus river being a tributary of the Maeander"

# NOTE(review): truncation is disabled, so an input longer than max_length is
# presumably kept whole (only shorter inputs get padded) -- confirm this is the
# intent if the output is meant to be fed to a fixed-length text encoder.
tokens = tokenizer([text], return_tensors="pt", padding="max_length", truncation=False)

# Tokenization does not need a GPU: use CUDA when present, otherwise stay on
# the CPU instead of crashing on machines without a CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokens = tokens.to(device)

token_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]
# Per-sequence sum of the attention mask.
num_tokens = attention_mask.sum(dim=1)

print(len(text.split(" ")))
print(num_tokens)