'Qwen2_5_VLProcessor' object has no attribute 'eos_token'
#22
by itztheking - opened
Hi, I'm trying to finetune only the text side (while keeping the vision capabilities) of Qwen2.5 VL, specifically unsloth/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit, but I get the error above when accessing EOS_TOKEN = tokenizer.eos_token while setting up the tokenizer for training.
I checked tokenizer_config.json and it does have an eos_token field. Why is this happening, then?
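I also tried poking at the object directly. This is just a sketch of my check, assuming the returned "tokenizer" is actually a processor that keeps the text tokenizer in a .tokenizer attribute (I'm not sure that's how Unsloth exposes it):

# Sketch only: assumes the processor wraps the text tokenizer as .tokenizer,
# which is my assumption, not something taken from the Unsloth docs.
if hasattr(tokenizer, "eos_token"):
    EOS_TOKEN = tokenizer.eos_token
elif hasattr(tokenizer, "tokenizer"):
    EOS_TOKEN = tokenizer.tokenizer.eos_token
else:
    raise AttributeError("no eos_token on the processor or its inner tokenizer")
print(repr(EOS_TOKEN))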
My code:
from unsloth import FastLanguageModel
import torch

max_seq_length = 10000  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit",  # "unsloth/mistral-7b" for 16bit loading
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
)
model = FastLanguageModel.get_peft_model(
    model,
    r = 128,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "embed_tokens", "lm_head",],  # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0,  # Supports any, but = 0 is optimized
    bias = "none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)
from datasets import disable_caching, load_dataset, Dataset, Features, Value
from tqdm.auto import tqdm

# Initialize empty fallback dataset
cleaned_dataset = Dataset.from_dict({"content": []})

print("Loading dataset...")
try:
    # Define a union schema covering all columns from any batch.
    union_schema = Features({
        "document_id": Value("string"),
        "filing_date": Value("string"),  # present in some batches
        "title": Value("string"),        # present in some batches
        "chunk_index": Value("int64"),   # present in some batches
        "chunk_id": Value("int64"),      # present in some batches
        "problem_id": Value("int64"),    # present in some batches
        "sample_id": Value("int64"),     # newly added field
        "article_id": Value("int64"),    # newly added field
        "thread_id": Value("int64"),     # newly added field
        "source": Value("string"),       # from pdf_chunks and others
        "content": Value("string"),
    })

    # Load the dataset using the union schema.
    dataset = load_dataset("itztheking/FQwen-1.0", split="train[1000:2000]", features=union_schema)

    # Remove all extra columns so that only "content" remains.
    cleaned_dataset = dataset.remove_columns([
        "document_id", "filing_date", "title",
        "chunk_index", "chunk_id", "problem_id",
        "sample_id", "article_id", "thread_id", "source",
    ])

    print(f"Successfully created dataset with {len(cleaned_dataset)} examples")
except Exception as e:
    print(f"Error loading dataset: {e}")

# Display a sample (first 200 characters)
if len(cleaned_dataset) > 0:
    sample = cleaned_dataset[0]["content"]
    print("\nSample content:")
    print(sample[:200] + "..." if len(sample) > 200 else sample)

# Prepare for formatting: ensure your tokenizer is defined with an eos_token.
EOS_TOKEN = tokenizer.eos_token

# Define a formatting function to process batches.
def formatting_prompts_func(batch):
    # Append the EOS token to each content string and create a new "text" field.
    return {"text": [content + EOS_TOKEN for content in batch["content"]]}

print("\nFormatting dataset for pretraining...")

# Use the .map() method with batched processing and a progress bar.
formatted_dataset = cleaned_dataset.map(
    formatting_prompts_func,
    batched=True,
)

# Optionally, if you only need the "text" column, remove the original "content" column.
formatted_dataset = formatted_dataset.remove_columns(["content"])

print(f"\nDataset formatted with {len(formatted_dataset)} examples")
if len(formatted_dataset) > 0:
    sample_text = formatted_dataset[0]["text"]
    print(f"Sample formatted text: {sample_text[:100]}...")