Jan 4

from transformers import WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
import numpy as np
import os

# Chunked Whisper transcription script.
#
# Reads a WAV file, splits it into fixed-length chunks, transcribes each chunk
# with a Whisper checkpoint, and writes the joined text to a file.

# Load model and processor

model_name = "openai/whisper-large"  # Replace with your desired Whisper model
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# NOTE(review): some fine-tuned checkpoints reportedly ship an empty
# `suppress_tokens` list, which certain transformers releases reject.
# None expresses the same thing ("suppress nothing"), so normalise it.
if getattr(model.generation_config, "suppress_tokens", None) == []:
    model.generation_config.suppress_tokens = None

# Load and process audio

audio_file = "converted_audio.wav"  # Replace with your audio file path
audio, sampling_rate = sf.read(audio_file)

# sf.read returns a (frames, channels) array for multi-channel files, while
# the Whisper feature extractor takes a single channel: average them.
if audio.ndim > 1:
    audio = np.mean(audio, axis=1)

# Fail early, with an actionable message, instead of deep inside the loop.
expected_rate = processor.feature_extractor.sampling_rate
if sampling_rate != expected_rate:
    raise ValueError(
        f"{audio_file} is sampled at {sampling_rate} Hz but the model expects "
        f"{expected_rate} Hz; resample the file first."
    )

# Split audio into 10-second chunks

chunk_duration = 10  # seconds
chunk_size = sampling_rate * chunk_duration
chunks = [
    audio[start:start + chunk_size]
    for start in range(0, len(audio), chunk_size)
]
num_chunks = len(chunks)

# Ensure output folder exists for temporary chunk files

temp_folder = "temp_audio_chunks"
os.makedirs(temp_folder, exist_ok=True)

# Transcribe each chunk

transcriptions = []
written_chunk_paths = []  # only files created by this run get deleted
try:
    for idx, chunk in enumerate(chunks, start=1):
        temp_chunk_path = os.path.join(temp_folder, f"chunk_{idx}.wav")
        # Record the path before writing so a partial file is cleaned up too.
        written_chunk_paths.append(temp_chunk_path)
        sf.write(temp_chunk_path, chunk, sampling_rate)  # Save chunk to file

        # Turn the raw samples into the features the model consumes.
        inputs = processor(
            chunk, sampling_rate=sampling_rate, return_tensors="pt"
        )
        input_features = inputs.input_features

        # The language is a decoding option, so it is passed to generate().
        # Drop the argument when using an English-only (".en") checkpoint.
        predicted_ids = model.generate(input_features, language="en")
        transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]
        transcriptions.append(transcription)
        print(f"Transcribed chunk {idx}/{num_chunks}: {transcription}")
finally:
    # Clean up temporary files, even when transcription fails part-way.
    for chunk_path in written_chunk_paths:
        try:
            os.remove(chunk_path)
        except FileNotFoundError:
            pass  # the write never got as far as creating the file
    try:
        os.rmdir(temp_folder)
    except OSError:
        # Deliberate best effort: the folder holds files this run did not
        # create, so leave it (and them) in place.
        pass

# Combine transcriptions

full_transcription = " ".join(transcriptions)

# Save transcription to a text file

output_file = "transcription_output.txt"
with open(output_file, "w", encoding="utf-8") as file:
    file.write(full_transcription)

print(f"Full transcription saved to {output_file}")

This is the code that I'm using.

iamgrootns changed discussion title from Whisper hindi small giving an Error While Whisper Large v3 is able to transcribe the Audio with the Same Code. to Whisper hindi small giving an Error While Whisper Large v3 is able to transcribe the Audio with the Same Code.(Kind of Urgent) Jan 4

lewington

Feb 21

•

edited Feb 22

This will fix the error:

transcribe.generation_config.suppress_tokens = None

To begin with, `suppress_tokens` is `[]`, and transformers does not allow this: it has to be `None`, not an empty list.