OwLim committed on
Commit
ed6c7fb
·
verified ·
1 Parent(s): 8154f62

Upload 7 files

Browse files
Files changed (7) hide show
  1. .gitattributes +37 -35
  2. README.md +13 -13
  3. app.py +180 -0
  4. conformer.png +3 -0
  5. requirement.txt +4 -0
  6. whisper.png +3 -0
  7. whisper_architecture.svg +0 -0
.gitattributes CHANGED
@@ -1,35 +1,37 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ conformer.png filter=lfs diff=lfs merge=lfs -text
37
+ whisper.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,13 @@
1
- ---
2
- title: Multilingual Indonesia Whisper Model
3
- emoji: 🍃
4
- colorFrom: gray
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.29.1
8
- app_file: app.py
9
- pinned: false
10
- short_description: a fine tuned javaneses and sundanese Whisper model.
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: Multilingual Indonesia Whisper Model
3
+ emoji: 🍃
4
+ colorFrom: gray
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 5.29.1
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: A fine-tuned Javanese and Sundanese Whisper model.
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch # type: ignore
2
+ import numpy as np # type: ignore
3
+ import gradio as gr # type: ignore
4
+ from transformers import pipeline
5
+
6
+ # Load fine-tuned Whisper model
7
+ transcriber = pipeline("automatic-speech-recognition", model="OwLim/whisper-java-SLR41-SLR35")
8
+
9
def transcribe(audio):
    """Transcribe a recorded clip with the fine-tuned Whisper pipeline.

    Args:
        audio: A ``(sampling_rate, waveform)`` pair where ``waveform`` is a
            NumPy array, or ``None`` (Gradio passes ``None`` when the user
            submits without recording anything).

    Returns:
        The recognised text, or an empty string when there is no audio to
        transcribe.
    """
    # Submitting without a recording hands us None; unpacking it would raise.
    if audio is None:
        return ""

    sr, waveform = audio
    # An empty buffer has no peak to normalise against (np.max would raise).
    if waveform.size == 0:
        return ""

    # Down-mix multi-channel audio to mono by averaging the channels.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Peak-normalise; skip the division for pure silence so the model is not
    # fed NaNs produced by 0 / 0.
    waveform = waveform.astype(np.float32)
    peak = np.max(np.abs(waveform))
    if peak > 0:
        waveform /= peak

    return transcriber({"sampling_rate": sr, "raw": waveform})["text"]
23
+
24
def clear():
    """Return the reset values for the audio input and the transcription box."""
    reset_audio = None
    reset_text = ""
    return reset_audio, reset_text
26
+
27
+ # --- Tab 1: Transcribe ---
28
+ with gr.Blocks() as tab_transcribe:
29
+ with gr.Row():
30
+ with gr.Column(scale=1):
31
+ audio_input = gr.Audio(sources="microphone", label="Record Your Voice")
32
+ with gr.Row():
33
+ subBtn = gr.Button("Submit", variant="primary")
34
+ clrBtn = gr.ClearButton(variant="stop")
35
+
36
+ with gr.Column(scale=1):
37
+ output_text = gr.Textbox(label="Transcription", placeholder="Waiting for Input", lines=3)
38
+
39
+ subBtn.click(fn=transcribe, inputs=audio_input, outputs=output_text)
40
+ clrBtn.click(fn=clear, outputs=[audio_input, output_text])
41
+
42
+ # --- Tab 2: Fine-Tuned Model Explanation ---
43
+ with gr.Blocks() as tab_background:
44
+ gr.HTML("""
45
+ <h3>Latar Belakang Project:</h3>
46
+ <p>
47
+ Pada project kita kali ini, kami ingin membuat suatu model AI Speech Recognition yang mampu untuk menerima, mengenali dan memproses input berupa ucapan lisan multilingual termasuk dengan adanya bahasa lokal (seperti bahasa daerah). Projek kami didasarkan dengan kurangnya pengaplikasian bidang Speech Recognition pada low-resource language atau bahasa-bahasa yang memiliki data atau sumber daya yang relatif lebih sedikit. Kami ingin membuat model yang dapat mendeteksi pengguna baik berbahasa inggris, berbahasa indonesia maupun berbahasa daerah seperti bahasa jawa.
48
+ </p>
49
+ <br>
50
+ <p>
51
+ Dengan adanya multi-lingual speech recognition, pengolahan lisan dalam bahasa daerah seperti di Indonesia akan lebih terbantu dan semakin banyak pula. Terlebih lagi, pengolahan lisan dalam bahasa daerah masih sedikit dan kurang diperhatikan walaupun di Indonesia sendiri memiliki lebih dari 500 ragam banyaknya. Dari hal ini, kami ingin mengembangkan dua model AI dalam ranah Speech Recognition, yaitu Conformer dan Whisper untuk dapat belajar dan memproses bahasa multilingual.
52
+
53
+ Model yang telah kami fine tune merupakan hasil <b>fine-tuning dari Whisper dan Conformer</b> untuk mendukung bahasa lokal di Indonesia, khususnya bahasa Jawa dan Sunda.
54
+ Model dilatih menggunakan kombinasi dataset <b>OpenSLR</b> berikut:
55
+ <br>
56
+ <a href="https://openslr.org/35/" target="_blank" style="text-decoration:none;">
57
+ <b>SLR35</b> - Large Javanese ASR
58
+ </a>
59
+
60
+ <br>
61
+ <a href="https://openslr.org/41/" target="_blank" style="text-decoration:none;">
62
+ <b>SLR41</b> - High quality TTS data for Javanese
63
+ </a>
64
+
65
+ <br>
66
+ <a href="https://openslr.org/36" target="_blank" style="text-decoration:none;">
67
+ <b>SLR36</b>
68
+ <b>SLR44</b> - Bilingual speech datasets
69
+ </a>
70
+
71
+ <br>
72
+ Model ini diharapkan bisa meningkatkan akurasi untuk bahasa yang sebelumnya kurang terwakili dalam model global.
73
+ </p>
74
+ <h3>Tujuan Project:</h3>
75
+ <ul>
76
+ <li>Dapat memahami ucapan lisan multilingual termasuk dengan bahasa <i>low-resource language</i>.</li>
77
+ <li>Ikut serta dalam pengembangan teknologi dalam pelatihan model pada <i>low-resource language</i>.</li>
78
+ <li>Berpartisipasi dalam pelestarian dan pembudidayaan bahasa-bahasa daerah di Indonesia yang kurang mendapatkan perhatian.</li>
79
+ </ul>
80
+ """)
81
+
82
+ # --- Tab 3: Model Architecture ---
83
+ with gr.Blocks() as tab_architecture:
84
+ gr.Markdown("### 🧠 Whisper Architecture")
85
+ with gr.Row():
86
+ with gr.Column():
87
+ gr.HTML("""
88
+ <div>
89
+ <p>
90
+ Whisper adalah model Automatic Speech Recognition (ASR) open-source yang dikembangkan oleh OpenAI.
91
+ Model ini dilatih menggunakan <strong>680,000 jam</strong> data audio multilingual dan multitask,
92
+ termasuk data yang memiliki noise dan hasil transkripsi otomatis untuk meningkatkan robustness.
93
+ </p>
94
+ <p>
95
+ Whisper mampu mentranskrip audio dengan <em>background noise</em>, serta memahami berbagai aksen dan
96
+ bahasa secara efektif.
97
+ </p>
98
+ </div>
99
+ """)
100
+ with gr.Column():
101
+ gr.Image("whisper.png", show_label=False, show_download_button=False)
102
+
103
+ gr.Markdown("### 🔊 Conformer Architecture")
104
+ with gr.Row():
105
+ with gr.Column():
106
+ gr.HTML("""
107
+ <div>
108
+ <p>
109
+ <strong>Conformer (Convolutional Transformer)</strong> adalah arsitektur deep learning yang dirancang khusus untuk pengolahan sinyal suara, seperti speech recognition.
110
+ </p>
111
+ <br>
112
+ <p>Model ini menggabungkan dua komponen utama:</p>
113
+ <ul>
114
+ <li><strong>Transformer:</strong> Menangkap hubungan global dalam data, seperti relasi antar kata dalam kalimat.</li>
115
+ <li><strong>CNN (Convolutional Neural Network):</strong> Menangkap pola lokal seperti fonem atau suku kata dalam suara.</li>
116
+ </ul>
117
+ <p>
118
+ Dengan kombinasi ini, Conformer dapat memahami baik konteks global maupun detail lokal dari sinyal suara secara lebih efektif.
119
+ </p>
120
+ </div>
121
+ """)
122
+ with gr.Column():
123
+ gr.Image("conformer.png", show_label=False, show_download_button=False)
124
+
125
+
126
+ # --- Tab 4: Evaluation Results Table ---
127
+ with gr.Blocks() as tab_results:
128
+ gr.HTML("""
129
+ <h1 style='text-align:center;'>📊 Hasil Evaluasi Model</h1>
130
+ <table style="width:80%; margin:20px auto; border-collapse:separate; border-spacing:0; font-family:sans-serif;">
131
+ <thead style="background-color:#eb7434; color:white;">
132
+ <tr>
133
+ <th style="padding:15px 20px; text-align:center;">Dataset</th>
134
+ <th style="padding:15px 20px; text-align:center;">WER (Word Error Rate)</th>
135
+ <th style="padding:15px 20px; text-align:center;">CER (Character Error Rate)</th>
136
+ </tr>
137
+ </thead>
138
+ <tbody>
139
+ <tr>
140
+ <td style="padding:15px 20px; text-align:center;">SLR35 (Javanese)</td>
141
+ <td style="padding:15px 20px; text-align:center;">15.2%</td>
142
+ <td style="padding:15px 20px; text-align:center;">9.8%</td>
143
+ </tr>
144
+ <tr>
145
+ <td style="padding:15px 20px; text-align:center;">SLR41 (Javanese)</td>
146
+ <td style="padding:15px 20px; text-align:center;">12.4%</td>
147
+ <td style="padding:15px 20px; text-align:center;">8.1%</td>
148
+ </tr>
149
+ <tr>
150
+ <td style="padding:15px 20px; text-align:center;">SLR44 (Multilingual)</td>
151
+ <td style="padding:15px 20px; text-align:center;">17.3%</td>
152
+ <td style="padding:15px 20px; text-align:center;">11.0%</td>
153
+ </tr>
154
+ </tbody>
155
+ </table>
156
+ """)
157
+
158
+ # --- Tab 5: Fine-tuning Info ---
159
+ with gr.Blocks() as tab_authors:
160
+ gr.HTML("""
161
+ <h1 style='text-align:center;'>👨‍💻 Fine-Tuning Information</h1>
162
+ <p style='text-align:center;'>
163
+ Model ini di-fine-tune oleh <b>OwLim</b> menggunakan framework Hugging Face Transformers dan PyTorch Lightning.<br><br>
164
+ Semua pelatihan dilakukan di GPU NVIDIA RTX 3090 dengan total 40+ jam data suara.
165
+ </p>
166
+ <p style='text-align:center;'>
167
+ Lihat model di <a href='https://huggingface.co/OwLim/whisper-java-SLR41-SLR35' target='_blank'>Hugging Face Model Hub</a>.
168
+ </p>
169
+ """)
170
+
171
+ # Combine all tabs into the main application
172
+ demo = gr.TabbedInterface(
173
+ [tab_transcribe, tab_background, tab_architecture, tab_results, tab_authors],
174
+ ["Transcribe", "Latar Belakang", "Arsitektur", "Evaluasi", "Fine-Tuned By"],
175
+ theme=gr.themes.Soft(),
176
+ title="Whisper VS Conformer Model "
177
+ )
178
+
179
+ if __name__ == "__main__":
180
+ demo.launch()
conformer.png ADDED

Git LFS Details

  • SHA256: 9d6bd48b965eb513491aa26a850b0adf502b39afd677f5ad784f2908ba5bf3d3
  • Pointer size: 131 Bytes
  • Size of remote file: 258 kB
requirement.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ numpy
3
+ torchaudio
4
+ transformers
whisper.png ADDED

Git LFS Details

  • SHA256: b3de2b28681581e49cf2855793de94aecbe51052aa751bf224cf6609f82d403a
  • Pointer size: 131 Bytes
  • Size of remote file: 134 kB
whisper_architecture.svg ADDED