{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "59df7f41",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From c:\\Python311\\Lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
"\n",
"Using device: cuda:0\n",
"Using dtype: torch.float32\n"
]
}
],
"source": [
"import torch\n",
"import librosa\n",
"from transformers import pipeline\n",
"import IPython.display as ipd\n",
"import os\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=UserWarning, module=\"transformers.modeling_utils\")\n",
"\n",
"model_id = \"ysdede/whisper-khanacademy-large-v3-turbo-tr\"\n",
"\n",
"device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
"# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n",
"torch_dtype = torch.float32 # Use above line instead if you have a GPU with float16 support\n",
"\n",
"print(f\"Using device: {device}\")\n",
"print(f\"Using dtype: {torch_dtype}\")\n",
"\n"
]
},
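{
"cell_type": "code",
"execution_count": null,
"id": "a7c2f1d0",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (a minimal sketch, not part of the original script):\n",
"# torch.cuda.get_device_capability() reports the CUDA compute capability.\n",
"# As a rough rule of thumb, float16 inference is fast on capability >= 7.0\n",
"# (Volta and newer); the threshold used below is an assumption.\n",
"if torch.cuda.is_available():\n",
"    major, minor = torch.cuda.get_device_capability(0)\n",
"    print(f\"GPU: {torch.cuda.get_device_name(0)} (compute capability {major}.{minor})\")\n",
"    if major >= 7:\n",
"        print(\"float16 looks well supported; consider torch_dtype = torch.float16\")\n",
"else:\n",
"    print(\"No CUDA GPU detected; staying on CPU with float32.\")"
]
},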
{
"cell_type": "code",
"execution_count": null,
"id": "1aa91bb6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Checking for audio file: N:\\dataset_v3\\commonvoice_17_tr\\commonvoice_17_tr_fixed\\test\\common_voice_tr_40035941.mp3...\n",
"Audio file found. Duration: 7.40 seconds.\n",
"Displaying audio (if in a compatible environment):\n"
]
}
],
"source": [
"\n",
"audio_path = r'N:\\dataset_v3\\commonvoice_17_tr\\commonvoice_17_tr_fixed\\test\\common_voice_tr_40035941.mp3'\n",
"\n",
"print(f\"\\nChecking for audio file: {audio_path}...\")\n",
"if not os.path.exists(audio_path):\n",
" print(f\"Error: Audio file not found at '{audio_path}'.\")\n",
" print(\"Please make sure the 'audio_path' variable is set correctly above.\")\n",
"\n",
" raise FileNotFoundError(f\"Audio file not found: {audio_path}\")\n",
"else:\n",
" try:\n",
" waveform, sr = librosa.load(audio_path, sr=16000)\n",
" print(f\"Audio file found. Duration: {len(waveform)/sr:.2f} seconds.\")\n",
" print(\"Displaying audio (if in a compatible environment):\")\n",
" ipd.display(ipd.Audio(waveform, rate=sr))\n",
" except Exception as e:\n",
" print(f\"Could not load or display audio preview: {e}\")"
]
},
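{
"cell_type": "code",
"execution_count": null,
"id": "b3e9d5c2",
"metadata": {},
"outputs": [],
"source": [
"# Optional preprocessing sketch (an illustration, not required for this dataset):\n",
"# librosa.effects.trim strips leading/trailing silence below a dB threshold\n",
"# relative to the signal peak. The top_db value here is an assumed example.\n",
"trimmed, interval = librosa.effects.trim(waveform, top_db=30)\n",
"print(f\"Original: {len(waveform)/sr:.2f}s, trimmed: {len(trimmed)/sr:.2f}s\")\n",
"print(f\"Kept sample range: {interval[0]}..{interval[1]}\")"
]
},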
{
"cell_type": "code",
"execution_count": 3,
"id": "9b0d143a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Initializing ASR pipeline for model: ysdede/whisper-khanacademy-large-v3-turbo-tr...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline initialized.\n",
"\n",
"Starting transcription for 'N:\\dataset_v3\\commonvoice_17_tr\\commonvoice_17_tr_fixed\\test\\common_voice_tr_40035941.mp3'...\n",
"Language: tr, Task: transcribe\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"==================== Transcription Result ====================\n",
" Nitekim, cam üreticilerini saf olmayan camlar üretmeye ikna edemeyince, kendi camlarını yapmaya başlamıştır.\n",
"==============================================================\n",
"\n",
"Script finished.\n"
]
}
],
"source": [
"print(f\"\\nInitializing ASR pipeline for model: {model_id}...\")\n",
"\n",
"try:\n",
" pipe = pipeline(\n",
" \"automatic-speech-recognition\",\n",
" model=model_id,\n",
" torch_dtype=torch_dtype,\n",
" device=device,\n",
" )\n",
" print(\"Pipeline initialized.\")\n",
"\n",
" # Define generation arguments: language 'tr' (Turkish), task 'transcribe'\n",
" # Change 'tr' to another language code (e.g., 'en') if your audio is different\n",
" # Use task='translate' if you want to translate the audio to English\n",
" generation_args = {\n",
" \"language\": \"tr\",\n",
" \"task\": \"transcribe\"\n",
" }\n",
"\n",
" print(f\"\\nStarting transcription for '{audio_path}'...\")\n",
" print(f\"Language: {generation_args['language']}, Task: {generation_args['task']}\")\n",
" result = pipe(audio_path, generate_kwargs=generation_args, max_new_tokens=440)\n",
"\n",
" print(\"\\n\" + \"=\"*20 + \" Transcription Result \" + \"=\"*20)\n",
" print(result[\"text\"])\n",
" print(\"=\"* (40 + len(\" Transcription Result \")))\n",
"\n",
"except Exception as e:\n",
" print(f\"\\nAn error occurred during pipeline execution: {e}\")\n",
" print(\"Please check the model ID, audio file path, and available resources (RAM/VRAM).\")\n",
"\n",
"print(\"\\nScript finished.\")"
]
}
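,
{
"cell_type": "code",
"execution_count": null,
"id": "c4f8a6e1",
"metadata": {},
"outputs": [],
"source": [
"# Follow-up sketch (not part of the original script): the same pipeline can\n",
"# chunk long-form audio and return per-segment timestamps. chunk_length_s and\n",
"# return_timestamps are standard transformers pipeline options; the 30-second\n",
"# window below is an assumed value, not tuned for this model.\n",
"result_ts = pipe(\n",
"    audio_path,\n",
"    chunk_length_s=30,\n",
"    generate_kwargs=generation_args,\n",
"    return_timestamps=True,\n",
")\n",
"for chunk in result_ts[\"chunks\"]:\n",
"    start, end = chunk[\"timestamp\"]\n",
"    print(f\"[{start} - {end}] {chunk['text']}\")"
]
}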
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}