openai/whisper-large-v3 · Word Timestamps error: RuntimeError: The expanded size of the tensor (69) must match the existing size (72) at non-singleton dimension 1. Target sizes: [1, 69]. Tensor sizes: [72] whisper word.

Jun 11

Hey,

I am using the following code. It works fine with return_timestamps=True, but changing it to return_timestamps="word" gives me this error. I also tried batch_size=1 and setting the maximum token length in the tokenizer config. My audio is around 1 to 2 hours long.

from logger import logger
import os
from glob import glob
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from utils import ms_to_hmsms, log_vram_usage

class WhisperHFTranscriber:
    """Transcribe WAV files with a Hugging Face Whisper ASR pipeline.

    The pipeline is built once in the constructor and reused for every file
    processed by :meth:`transcribe_directory`.
    """

    def __init__(self, model_id="openai/whisper-large-v3", device="cuda:0"):
        """Load the model and processor and build the chunked ASR pipeline.

        Args:
            model_id: Hugging Face model identifier to load.
            device: Device the model and the pipeline are placed on.
        """
        logger.info(f"Using device for transcription: {device}")
        self._model_id = model_id
        self._device = device
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        ).to(device)

        log_vram_usage("Before Whisper HF ram usage")
        processor = AutoProcessor.from_pretrained(model_id)

        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30,
            batch_size=16,
            torch_dtype=dtype,
            device=device,
        )
        log_vram_usage("After Whisper HF ram usage")

    @staticmethod
    def _format_segment(segment):
        """Return one output line (with trailing newline) for a pipeline chunk.

        A chunk whose ``timestamp`` is a two-element list/tuple is rendered as
        ``[start - end] text``; a ``None`` bound is rendered as ``MISSING``.
        If no bound is available at all, ``[MISSING TIMESTAMP] text`` is used.
        """
        timestamp = segment.get("timestamp", None)
        seg_text = segment.get("text", "").strip()

        seg_start = seg_end = None
        if isinstance(timestamp, (list, tuple)) and len(timestamp) == 2:
            seg_start, seg_end = timestamp

        if seg_start is None and seg_end is None:
            return f"[MISSING TIMESTAMP] {seg_text}\n"

        start_str = f"{seg_start:.2f}" if seg_start is not None else "MISSING"
        end_str = f"{seg_end:.2f}" if seg_end is not None else "MISSING"
        return f"[{start_str} - {end_str}] {seg_text}\n"

    def transcribe_directory(self, input_dir, output_dir):
        """Transcribe every ``*.wav`` file in ``input_dir`` into ``output_dir``.

        For each audio file the plain text is written to
        ``full_transcription.txt`` and the timestamped segments to
        ``segment_timestamps.txt``. Returns ``None``; logs a warning and
        returns early when no WAV files are found.

        Args:
            input_dir: Directory searched (non-recursively) for ``*.wav`` files.
            output_dir: Directory for the result files; created if missing.
        """
        os.makedirs(output_dir, exist_ok=True)
        audio_files = glob(os.path.join(input_dir, "*.wav"))

        if not audio_files:
            logger.warning(f"No audio files found in {input_dir}. Please check the directory.")
            return

        logger.info(f"Found {len(audio_files)} audio file(s) to transcribe.")

        # Forwarded to generation so the language is fixed rather than
        # auto-detected per chunk.
        generate_kwargs = {
            "language": "de",
        }

        # NOTE(review): every audio file writes to the same two file names, so
        # with more than one input only the last result survives -- confirm
        # whether per-file output names are wanted.
        full_txt_path = os.path.join(output_dir, "full_transcription.txt")
        segment_txt_path = os.path.join(output_dir, "segment_timestamps.txt")

        for audio_path in audio_files:
            name = os.path.splitext(os.path.basename(audio_path))[0]
            logger.info(f"Transcribing: {name}")

            result = self.pipe(
                audio_path,
                return_timestamps=True,
                generate_kwargs=generate_kwargs,
            )

            # Save plain text
            with open(full_txt_path, "w", encoding="utf-8") as f:
                f.write(result.get("text", ""))

            # Save one line per timestamped segment
            with open(segment_txt_path, "w", encoding="utf-8") as f:
                f.writelines(
                    self._format_segment(segment)
                    for segment in result.get("chunks", [])
                )

            logger.info(f"Saved: {output_dir}/full_transcription.txt")
            logger.info(f"Saved: {output_dir}/segment_timestamps.txt")
            torch.cuda.empty_cache()

RuntimeError: The expanded size of the tensor (69) must match the existing size (72) at non-singleton dimension 1. Target sizes: [1, 69]. Tensor sizes: [72] whisper word

xiqwertyui

Jun 12

Same problem. As soon as I modify the example code to use return_timestamps="word", it stops working.

arjunc02

about 8 hours ago

I ran into the same issue, but upgrading to transformers==4.54.1 resolved it for me.