'Qwen2_5_VLProcessor' object has no attribute 'eos_token'

#22
by itztheking - opened

Hi, I'm trying to finetune only the text side (while keeping the vision capabilities) of Qwen2.5 VL, specifically unsloth/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit, but I get the error above when accessing EOS_TOKEN = tokenizer.eos_token to prepare the tokenizer for training.

I checked the tokenizer_config.json, and it does have an eos_token field. Why is this happening then?
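
For reference, this is the kind of fallback I'm considering while I figure it out (just a sketch, assuming the object returned by FastLanguageModel is actually the Qwen2_5_VLProcessor, which wraps the text tokenizer as .tokenizer):

# Sketch of a possible workaround (assumption: the returned "tokenizer" is the
# Qwen2_5_VLProcessor, which exposes the underlying text tokenizer as .tokenizer)
if hasattr(tokenizer, "eos_token"):
    EOS_TOKEN = tokenizer.eos_token
elif hasattr(tokenizer, "tokenizer"):
    # Processor case: read the EOS token from the wrapped text tokenizer
    EOS_TOKEN = tokenizer.tokenizer.eos_token
else:
    raise AttributeError("No eos_token found on the returned object")
print(EOS_TOKEN)

But I'd still like to understand why the attribute is missing in the first place.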

My code:

from unsloth import FastLanguageModel
import torch
max_seq_length = 10000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit", # "unsloth/mistral-7b" for 16bit loading
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",

                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

from datasets import disable_caching, load_dataset, Dataset, Features, Value
from tqdm.auto import tqdm


# Initialize empty fallback dataset
cleaned_dataset = Dataset.from_dict({"content": []})

print("Loading dataset...")

try:
    # Define a union schema covering all columns from any batch.
    union_schema = Features({
        "document_id": Value("string"),
        "filing_date": Value("string"),   # present in some batches
        "title": Value("string"),           # present in some batches
        "chunk_index": Value("int64"),      # present in some batches
        "chunk_id": Value("int64"),         # present in some batches
        "problem_id": Value("int64"),       # present in some batches
        "sample_id": Value("int64"),        # newly added field
        "article_id": Value("int64"),       # newly added field
        "thread_id": Value("int64"),        # newly added field
        "source": Value("string"),          # from pdf_chunks and others
        "content": Value("string")
    })
    
    # Load the dataset using the union schema.
    dataset = load_dataset("itztheking/FQwen-1.0", split="train[1000:2000]", features=union_schema)
    
    # Remove all extra columns so that only "content" remains.
    cleaned_dataset = dataset.remove_columns([
        "document_id", "filing_date", "title",
        "chunk_index", "chunk_id", "problem_id",
        "sample_id", "article_id", "thread_id", "source"
    ])
    
    print(f"Successfully created dataset with {len(cleaned_dataset)} examples")
    
except Exception as e:
    print(f"Error loading dataset: {e}")

# Display a sample (first 200 characters)
if len(cleaned_dataset) > 0:
    sample = cleaned_dataset[0]["content"]
    print("\nSample content:")
    print(sample[:200] + "..." if len(sample) > 200 else sample)

# Prepare for formatting: ensure your tokenizer is defined with an eos_token.
EOS_TOKEN = tokenizer.eos_token

# Define a formatting function to process batches.
def formatting_prompts_func(batch):
    # Append the EOS token to each content string and create a new "text" field.
    return {"text": [content + EOS_TOKEN for content in batch["content"]]}

print("\nFormatting dataset for pretraining...")
# Use the .map() method with batched processing and a progress bar.
formatted_dataset = cleaned_dataset.map(
    formatting_prompts_func,
    batched=True,
)

# Optionally, if you only need the "text" column, remove the original "content" column.
formatted_dataset = formatted_dataset.remove_columns(["content"])

print(f"\nDataset formatted with {len(formatted_dataset)} examples")
if len(formatted_dataset) > 0:
    sample_text = formatted_dataset[0]["text"]
    print(f"Sample formatted text: {sample_text[:100]}...")