Word Timestamps error: RuntimeError: The expanded size of the tensor (69) must match the existing size (72) at non-singleton dimension 1. Target sizes: [1, 69]. Tensor sizes: [72] whisper word.
Hey,
I am using the following code. It works fine when I set `return_timestamps=True`, but changing it to `return_timestamps="word"` gives me this error. I also tried `batch_size=1` and setting the maximum token length in the tokenizer config, without success. My audio is around 1 to 2 hours long.
from logger import logger
import os
from glob import glob
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from utils import ms_to_hmsms, log_vram_usage
class WhisperHFTranscriber:
    """Transcribe ``.wav`` files with a Hugging Face Whisper ASR pipeline.

    The model and processor are loaded once in the constructor and wrapped in
    a chunked ``automatic-speech-recognition`` pipeline stored on
    ``self.pipe``; :meth:`transcribe_directory` then runs that pipeline over
    every ``*.wav`` file in a directory.
    """

    def __init__(self, model_id="openai/whisper-large-v3", device="cuda:0"):
        """Load the model and build the transcription pipeline.

        Args:
            model_id: Hugging Face model identifier passed to
                ``from_pretrained``.
            device: Device the model is moved to and the pipeline runs on.
        """
        logger.info(f"Using device for transcription: {device}")
        self._model_id = model_id
        self._device = device

        # Pick half precision only when the *requested* device is a usable
        # CUDA device; checking availability alone would select float16 for
        # device="cpu" on a machine that happens to have a GPU.
        on_cuda = torch.device(device).type == "cuda" and torch.cuda.is_available()
        dtype = torch.float16 if on_cuda else torch.float32

        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        ).to(device)
        log_vram_usage("Before Whisper HF ram usage")

        processor = AutoProcessor.from_pretrained(model_id)
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30,
            batch_size=16,
            torch_dtype=dtype,
            device=device,
        )
        log_vram_usage("After Whisper HF ram usage")

    # Backward-compatible alias: the constructor was previously spelled
    # ``init`` (so it never ran on instantiation); keep the old name callable.
    init = __init__

    @staticmethod
    def _format_segment(segment):
        """Return one ``[start - end] text`` line (newline-terminated) for a chunk."""
        timestamp = segment.get("timestamp", None)
        seg_text = segment.get("text", "").strip()

        seg_start = seg_end = None
        if isinstance(timestamp, (list, tuple)) and len(timestamp) == 2:
            seg_start, seg_end = timestamp

        if seg_start is None and seg_end is None:
            return f"[MISSING TIMESTAMP] {seg_text}\n"

        # Either bound may individually be None; mark only that side as missing.
        start_str = f"{seg_start:.2f}" if seg_start is not None else "MISSING"
        end_str = f"{seg_end:.2f}" if seg_end is not None else "MISSING"
        return f"[{start_str} - {end_str}] {seg_text}\n"

    def transcribe_directory(self, input_dir, output_dir, language="de"):
        """Transcribe every ``*.wav`` file in ``input_dir`` into ``output_dir``.

        For each audio file the plain text is written to
        ``full_transcription.txt`` and the per-chunk timestamps to
        ``segment_timestamps.txt``. When more than one audio file is found,
        both names are prefixed with ``<audio basename>_`` so that later files
        do not overwrite earlier results.

        Args:
            input_dir: Directory searched (non-recursively) for ``*.wav`` files.
            output_dir: Directory for the output files; created if missing.
            language: Language forwarded to generation via ``generate_kwargs``.
                Pass ``None`` to send no language hint.

        Returns:
            None. Logs a warning and returns early when no audio is found.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Sorted so the processing order is deterministic across runs.
        audio_files = sorted(glob(os.path.join(input_dir, "*.wav")))
        if not audio_files:
            logger.warning(f"No audio files found in {input_dir}. Please check the directory.")
            return
        logger.info(f"Found {len(audio_files)} audio file(s) to transcribe.")

        pipe_kwargs = {"return_timestamps": True}
        if language:
            # Previously this dict was built but never handed to the pipeline.
            pipe_kwargs["generate_kwargs"] = {"language": language}

        several_inputs = len(audio_files) > 1
        for audio_path in audio_files:
            name = os.path.splitext(os.path.basename(audio_path))[0]
            logger.info(f"Transcribing: {name}")

            # Keep the historical file names for a single input; disambiguate
            # only when fixed names would make files clobber each other.
            prefix = f"{name}_" if several_inputs else ""
            text_name = f"{prefix}full_transcription.txt"
            segments_name = f"{prefix}segment_timestamps.txt"

            result = self.pipe(audio_path, **pipe_kwargs)

            # Save plain text
            with open(os.path.join(output_dir, text_name), "w", encoding="utf-8") as f:
                f.write(result.get("text", ""))

            # Save one line per timestamped chunk
            with open(os.path.join(output_dir, segments_name), "w", encoding="utf-8") as f:
                f.writelines(
                    self._format_segment(segment)
                    for segment in result.get("chunks", [])
                )

            logger.info(f"Saved: {output_dir}/{text_name}")
            logger.info(f"Saved: {output_dir}/{segments_name}")
            torch.cuda.empty_cache()
RuntimeError: The expanded size of the tensor (69) must match the existing size (72) at non-singleton dimension 1. Target sizes: [1, 69]. Tensor sizes: [72] whisper word
Same problem here. As soon as I change the example code to use `return_timestamps="word"`, it stops working.