"""Count how many tokens a sample passage produces with the local tokenizer.

Loads the tokenizer stored in the current working directory, encodes one
hard-coded passage without truncation, and prints the passage's
space-separated word count followed by its token count.
"""

import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

# The tokenizer files are looked up in the directory the script is run from.
path = os.getcwd()

# Passed to the tokenizer as model_max_length; with padding="max_length"
# below it is also the length that shorter inputs are padded up to.
max_length = 77

tokenizer = AutoTokenizer.from_pretrained(
    path,
    model_max_length=max_length,
    use_fast=False,
    # NOTE(review): trust_remote_code lets tokenizer code shipped alongside
    # the checkpoint run locally; only point `path` at a trusted directory.
    trust_remote_code=True,
    padding_side="right",
    truncation_side="right",
    # Forwarded untouched; presumably consumed by the checkpoint's custom
    # tokenizer class -- TODO confirm.
    add_eod_token=True,
)
tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})

text = "Several years after the first victory over the Gauls, Pergamon was again attacked by the Gauls together with their ally Antiochus Hierax, the younger brother of Seleucus II Callinicus, and ruler of Seleucid Asia Minor from his capital at Sardis. Attalus defeated the Gauls and Antiochus at the Battle of Aphrodisium and again at a second battle in the east. Three subsequent battles were fought and won against Antiochus Hierax's forces, which fought without support from the Gauls: in Hellespontine Phrygia, where Antiochus was perhaps seeking refuge with his father-in law, Ziaelas the king of Bithynia; near Sardis in the spring of 228 BC; and, in the final conflict of the campaign, in Caria at the Battle of the Harpasus, the Harpasus river being a tributary of the Maeander"

# truncation=False keeps every token of the passage even when it is longer
# than max_length, so the count printed below reflects the full text.
tokens = tokenizer(
    [text],
    return_tensors="pt",
    padding="max_length",
    truncation=False,
)

# Counting tokens does not need a GPU: use CUDA when it is present, but fall
# back to the CPU instead of crashing on machines without one.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokens = tokens.to(device)

token_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]

# Summing the attention mask along the sequence axis gives, per input row,
# the number of positions the tokenizer marked as real (non-padding) tokens.
num_tokens = attention_mask.sum(dim=1)

print(len(text.split(" ")))
print(num_tokens)
|