import os import torch import torch.nn as nn import torchaudio import numpy as np # type: ignore import gradio as gr # type: ignore import pandas as pd from transformers import pipeline from huggingface_hub import hf_hub_download from torchaudio.models import Conformer class ASRConformerModel(nn.Module): def __init__(self, input_dim, vocab_size): super().__init__() self.encoder = Conformer( input_dim=input_dim, num_heads=4, ffn_dim=512, num_layers=4, depthwise_conv_kernel_size=31, dropout=0.1 ) self.classifier = nn.Linear(input_dim, vocab_size) def forward(self, x, lengths): x, lengths = self.encoder(x, lengths=lengths) x = self.classifier(x) return x, lengths VOCAB = set("abcdefghijklmnopqrstuvwxyz '") char_to_idx = {ch: i + 1 for i, ch in enumerate(sorted(VOCAB))} # 0 for CTC blank def greedy_decode(log_probs, blank=0): pred_ids = log_probs.argmax(dim=-1) # [T, B] pred_ids = pred_ids.transpose(0, 1) # [B, T] predictions = [] for seq in pred_ids: prev = blank pred = [] for i in seq: if i != prev and i != blank: pred.append(i.item()) prev = i predictions.append(pred) return predictions def encode(text): return torch.tensor([char_to_idx[c] for c in text.lower() if c in char_to_idx], dtype=torch.long) def decode_to_text(predictions, idx_to_char): return [''.join(idx_to_char[i] for i in pred if i in idx_to_char) for pred in predictions] # Load fine-tuned Whisper model transcriber_whisper = pipeline("automatic-speech-recognition", model="OwLim/whisper-sundanese-finetune") transcriber_wav2vec = pipeline("automatic-speech-recognition", model="indonesian-nlp/wav2vec2-indonesian-javanese-sundanese") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") SAMPLE_RATE = 16_000 model_path = hf_hub_download(repo_id="Blebbyblub/javanese-conformer-asrV2", filename="pytorch_model.bin") model = ASRConformerModel(input_dim=80, vocab_size=29).to(device) model.load_state_dict(torch.load(model_path, map_location=device)) examples_audio = [ file for file in os.listdir("./") if file.endswith(".wav") ] idx_to_char = {v: k for k, v in char_to_idx.items()} def transcribe(audio, model_selection): sr, waveform = audio # Change into Mono Audio if waveform.ndim > 1: waveform = waveform.mean(axis=1) # Normalisasi waveform = waveform.astype(np.float32) waveform /= np.max(np.abs(waveform)) if "Conformer" == model_selection : mel_transform = torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_mels=80) waveform = torch.from_numpy(waveform).float() if sr != SAMPLE_RATE: waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform) waveform = waveform.unsqueeze(0) mel = mel_transform(waveform).squeeze(0).transpose(0, 1) # [time, mel] mel = mel.unsqueeze(0).to(device) input_length = torch.tensor([mel.size(1)]).to(device) model.eval() with torch.no_grad(): output, output_lengths = model(mel, input_length) log_probs = output.log_softmax(2).transpose(0, 1) pred_ids = greedy_decode(log_probs) pred_text = decode_to_text(pred_ids, idx_to_char)[0] return pred_text if "Wav2Vec" == model_selection : selected_model = transcriber_wav2vec elif "Whisper" == model_selection: selected_model = transcriber_whisper return selected_model({ "sampling_rate" : sr, "raw" : waveform })["text"] def clear(): return None, "" # --- Tab 1: Transcribe --- with gr.Blocks() as tab_transcribe: model_selector = gr.Radio(choices=["Whisper", "Conformer", "Wav2Vec"], label="Choose Model", info="This will effect the model that you use for transcribing", ) with gr.Row(): with gr.Column(scale=1): audio_input = gr.Audio(sources="microphone", label="Record Your Voice") with gr.Row(): subBtn = gr.Button("Submit", variant="primary") clrBtn = gr.ClearButton(variant="stop") with gr.Column(scale=1): output_text = gr.Textbox(label="Transcription", placeholder="Waiting for Input", lines=3) gr.Examples( examples=examples_audio, # List of audio file paths inputs=audio_input, label="Try with Example Audio" ) subBtn.click(fn=transcribe, inputs=[audio_input, model_selector], outputs=output_text) clrBtn.click(fn=clear, outputs=[audio_input, output_text]) # --- Tab 2: Penjelasan Model Fine-Tuned --- with gr.Blocks() as tab_background: gr.HTML("""

Latar Belakang Project:

Pada project kita kali ini, kami ingin membuat suatu model AI Speech Recognition yang mampu untuk menerima, mengenali dan memproses input berupa ucapan lisan multilingual termasuk dengan adanya bahasa lokal (seperti bahasa daerah). Projek kami didasarkan dengan kurangnya pengaplikasian bidang Speech Recognition pada low-resource language atau bahasa-bahasa yang memiliki data atau sumber daya yang relatif lebih sedikit. Kami ingin membuat model yang dapat mendeteksi pengguna baik berbahasa inggris, berbahasa indonesia maupun berbahasa daerah seperti bahasa jawa.


Dengan adanya multi-lingual speech recognition, pengolahan lisan dalam bahasa daerah seperti di Indonesia akan lebih terbantu dan semakin banyak pula. Terlebih lagi, pengolahan lisan dalam bahasa daerah masih sedikit dan kurang diperhatikan walaupun di Indonesia sendiri memiliki lebih dari 500 ragam banyaknya. Dari hal ini, kami ingin mengembangkan dua model AI dalam ranah Speech Recognition, yaitu Conformer dan Whisper untuk dapat belajar dan memproses bahasa multilingual. Model yang telah kami fine tune merupakan hasil fine-tuning dari Whisper dan Conformer untuk mendukung bahasa lokal di Indonesia, khususnya bahasa Jawa dan Sunda. Model dilatih menggunakan kombinasi dataset OpenSLR berikut:
SLR35 - Large Javanese ASR training data set
SLR36 - Large Sundanese ASR training data set
SLR41 - High quality TTS data for Javanese
SLR44 - High quality TTS data for Sundanese.
Model ini diharapkan bisa meningkatkan akurasi untuk bahasa yang sebelumnya kurang terwakili dalam model global.

Tujuan Project:

""") # --- Tab 3: Arsitektur Model --- with gr.Blocks() as tab_architecture: gr.Markdown("### 🧠 Whisper Architecture") with gr.Row(): with gr.Column(): gr.HTML("""

Whisper adalah model Automatic Speech Recognition (ASR) open-source yang dikembangkan oleh OpenAI. Model ini dilatih menggunakan 680,000 jam data audio multilingual dan multitask, termasuk data yang memiliki noise dan hasil transkripsi otomatis untuk meningkatkan robustness.

Whisper mampu mentranskrip audio dengan background noise, serta memahami berbagai aksen dan bahasa secara efektif.

""") with gr.Column(): gr.Image("whisper.png", show_label=False, show_download_button=False) gr.Markdown("### 🔊 Conformer Architecture") with gr.Row(): with gr.Column(): gr.HTML("""

Conformer (Convolutional Transformer) adalah arsitektur deep learning yang dirancang khusus untuk pengolahan sinyal suara, seperti speech recognition.


Model ini menggabungkan dua komponen utama:

Dengan kombinasi ini, Conformer dapat memahami baik konteks global maupun detail lokal dari sinyal suara secara lebih efektif.

""") with gr.Column(): gr.Image("conformer.png", show_label=False, show_download_button=False) import gradio as gr import pandas as pd data_wer = { "Model": ["Whisper", "Conformer"], "WER": [11, 50], } data_cer = { "Model": ["Whisper", "Conformer"], 'CER': [0, 20] } df_WER = pd.DataFrame(data_wer) df_CER = pd.DataFrame(data_cer) # --- Tab 4: Tabel Hasil Evaluasi --- with gr.Blocks() as tab_results: gr.Markdown("## 📊 Best Error Rate (WER / CER)") with gr.Row(): gr.BarPlot( df_WER, x="Model", y="WER", title="Best WER by Model", tooltip=["Model", "WER"], y_lim=(0, 100), container=False, height=400, width=300 ) gr.BarPlot( df_CER, x="Model", y="CER", title="Best CER by Model", tooltip=["Model", "CER"], y_lim=(0, 100), container=False, height=400, width=300 ) # --- Tab 5: Fine-tuning Info --- with gr.Blocks() as tab_authors: gr.HTML("""

👨‍💻 Fine-Tuning Information

Model ini di-fine-tune oleh Brian, Nathan, Owen, Raenault menggunakan framework Hugging Face Transformers dan PyTorch

Semua pelatihan dilakukan di Google Colab.

Lihat model di Hugging Face Model Hub.

""") # Gabungkan semua tabs ke dalam aplikasi utama demo = gr.TabbedInterface( [tab_transcribe, tab_background, tab_architecture, tab_results, tab_authors], ["Transcribe", "Latar Belakang", "Arsitektur", "Evaluasi", "Fine-Tuned By"], theme=gr.themes.Soft(), title="Multilingual ASR Model" ) if __name__ == "__main__": demo.launch()