In [None]:
import torch
import librosa
from transformers import pipeline
import IPython.display as ipd
import os
import warnings

# Hide UserWarning messages raised from transformers.modeling_utils so the
# notebook output stays readable.
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="transformers.modeling_utils",
)

# Hugging Face Hub identifier of the fine-tuned Whisper checkpoint to load.
model_id = "ysdede/whisper-khanacademy-large-v3-turbo-tr"

# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Full precision is used on every device. On a GPU with float16 support the
# following line can be used instead:
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
torch_dtype = torch.float32

print(f"Using device: {device}")
print(f"Using dtype: {torch_dtype}")




Using device: cuda:0
Using dtype: torch.float32


In [None]:

# Location of the clip to transcribe. Edit this to point at your own file.
audio_path = r'N:\dataset_v3\commonvoice_17_tr\commonvoice_17_tr_fixed\test\common_voice_tr_40035941.mp3'

print(f"\nChecking for audio file: {audio_path}...")

# Guard clause: stop the notebook here when the path does not exist, since the
# transcription cell further down needs it.
if not os.path.exists(audio_path):
    print(f"Error: Audio file not found at '{audio_path}'.")
    print("Please make sure the 'audio_path' variable is set correctly above.")
    raise FileNotFoundError(f"Audio file not found: {audio_path}")

# Best-effort preview: a failure to decode or render the clip is reported but
# does not abort the notebook.
try:
    # Ask librosa for a 16 kHz sample rate when loading the clip.
    waveform, sr = librosa.load(audio_path, sr=16000)
    preview_seconds = len(waveform) / sr
    print(f"Audio file found. Duration: {preview_seconds:.2f} seconds.")
    print("Displaying audio (if in a compatible environment):")
    player = ipd.Audio(waveform, rate=sr)
    ipd.display(player)
except Exception as exc:
    print(f"Could not load or display audio preview: {exc}")


Checking for audio file: N:\dataset_v3\commonvoice_17_tr\commonvoice_17_tr_fixed\test\common_voice_tr_40035941.mp3...
Audio file found. Duration: 7.40 seconds.
Displaying audio (if in a compatible environment):


In [3]:
print(f"\nInitializing ASR pipeline for model: {model_id}...")

# Build the speech-recognition pipeline and transcribe `audio_path`.
# Any failure (model download, audio decoding, out-of-memory, ...) is reported
# as a short message instead of a traceback, so the cell always runs to the end.
try:
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        torch_dtype=torch_dtype,
        device=device,
    )
    print("Pipeline initialized.")

    # Generation arguments forwarded to the model through `generate_kwargs`:
    #   language       - 'tr' (Turkish); change to another code (e.g. 'en') if
    #                    your audio is in a different language
    #   task           - 'transcribe'; use 'translate' to translate the audio
    #                    to English instead
    #   max_new_tokens - upper bound on the number of tokens generated
    # `max_new_tokens` is placed here rather than passed as a separate argument
    # to the pipeline call, because that separate argument is deprecated in
    # recent transformers releases in favour of `generate_kwargs`.
    generation_args = {
        "language": "tr",
        "task": "transcribe",
        "max_new_tokens": 440,
    }

    print(f"\nStarting transcription for '{audio_path}'...")
    print(f"Language: {generation_args['language']}, Task: {generation_args['task']}")
    result = pipe(audio_path, generate_kwargs=generation_args)

    # Print the transcription between a banner and a closing rule of equal width.
    print("\n" + "="*20 + " Transcription Result " + "="*20)
    print(result["text"])
    print("="* (40 + len(" Transcription Result ")))

except Exception as e:
    print(f"\nAn error occurred during pipeline execution: {e}")
    print("Please check the model ID, audio file path, and available resources (RAM/VRAM).")

print("\nScript finished.")


Initializing ASR pipeline for model: ysdede/whisper-khanacademy-large-v3-turbo-tr...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Pipeline initialized.

Starting transcription for 'N:\dataset_v3\commonvoice_17_tr\commonvoice_17_tr_fixed\test\common_voice_tr_40035941.mp3'...
Language: tr, Task: transcribe


 attn_output = torch.nn.functional.scaled_dot_product_attention(



 Nitekim, cam üreticilerini saf olmayan camlar üretmeye ikna edemeyince, kendi camlarını yapmaya başlamıştır.

Script finished.
