{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "59df7f41", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From c:\\Python311\\Lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n", "\n", "Using device: cuda:0\n", "Using dtype: torch.float32\n" ] } ], "source": [ "import torch\n", "import librosa\n", "from transformers import pipeline\n", "import IPython.display as ipd\n", "import os\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\", category=UserWarning, module=\"transformers.modeling_utils\")\n", "\n", "model_id = \"ysdede/whisper-khanacademy-large-v3-turbo-tr\"\n", "\n", "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n", "torch_dtype = torch.float32 # Use above line instead if you have a GPU with float16 support\n", "\n", "print(f\"Using device: {device}\")\n", "print(f\"Using dtype: {torch_dtype}\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1aa91bb6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Checking for audio file: N:\\dataset_v3\\commonvoice_17_tr\\commonvoice_17_tr_fixed\\test\\common_voice_tr_40035941.mp3...\n", "Audio file found. Duration: 7.40 seconds.\n", "Displaying audio (if in a compatible environment):\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "\n", "audio_path = r'N:\\dataset_v3\\commonvoice_17_tr\\commonvoice_17_tr_fixed\\test\\common_voice_tr_40035941.mp3'\n", "\n", "print(f\"\\nChecking for audio file: {audio_path}...\")\n", "if not os.path.exists(audio_path):\n", " print(f\"Error: Audio file not found at '{audio_path}'.\")\n", " print(\"Please make sure the 'audio_path' variable is set correctly above.\")\n", "\n", " raise FileNotFoundError(f\"Audio file not found: {audio_path}\")\n", "else:\n", " try:\n", " waveform, sr = librosa.load(audio_path, sr=16000)\n", " print(f\"Audio file found. Duration: {len(waveform)/sr:.2f} seconds.\")\n", " print(\"Displaying audio (if in a compatible environment):\")\n", " ipd.display(ipd.Audio(waveform, rate=sr))\n", " except Exception as e:\n", " print(f\"Could not load or display audio preview: {e}\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "9b0d143a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Initializing ASR pipeline for model: ysdede/whisper-khanacademy-large-v3-turbo-tr...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Pipeline initialized.\n", "\n", "Starting transcription for 'N:\\dataset_v3\\commonvoice_17_tr\\commonvoice_17_tr_fixed\\test\\common_voice_tr_40035941.mp3'...\n", "Language: tr, Task: transcribe\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Python311\\Lib\\site-packages\\transformers\\models\\whisper\\modeling_whisper.py:691: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at ..\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:263.)\n", " attn_output = torch.nn.functional.scaled_dot_product_attention(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "==================== Transcription Result ====================\n", " Nitekim, cam üreticilerini saf olmayan camlar üretmeye ikna edemeyince, kendi camlarını yapmaya başlamıştır.\n", "==============================================================\n", "\n", "Script finished.\n" ] } ], "source": [ "print(f\"\\nInitializing ASR pipeline for model: {model_id}...\")\n", "\n", "try:\n", " pipe = pipeline(\n", " \"automatic-speech-recognition\",\n", " model=model_id,\n", " torch_dtype=torch_dtype,\n", " device=device,\n", " )\n", " print(\"Pipeline initialized.\")\n", "\n", " # Define generation arguments: language 'tr' (Turkish), task 'transcribe'\n", " # Change 'tr' to another language code (e.g., 'en') if your audio is different\n", " # Use task='translate' if you want to translate the audio to English\n", " generation_args = {\n", " \"language\": \"tr\",\n", " \"task\": \"transcribe\"\n", " }\n", "\n", " print(f\"\\nStarting transcription for '{audio_path}'...\")\n", " print(f\"Language: {generation_args['language']}, Task: {generation_args['task']}\")\n", " result = pipe(audio_path, generate_kwargs=generation_args, max_new_tokens=440)\n", "\n", " print(\"\\n\" + \"=\"*20 + \" Transcription Result \" + \"=\"*20)\n", " print(result[\"text\"])\n", " print(\"=\"* (40 + len(\" Transcription Result \")))\n", "\n", "except Exception as e:\n", " print(f\"\\nAn error occurred during pipeline execution: {e}\")\n", " print(\"Please check the model ID, audio file path, and available resources (RAM/VRAM).\")\n", "\n", "print(\"\\nScript finished.\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }