Commit
·
6fd5ec9
0
Parent(s):
初始提交 - 不包含二進制檔案
Browse files- .gitattributes +37 -0
- .gitignore +65 -0
- .vscodesettings_backup.json +0 -0
- README.md +49 -0
- app.py +966 -0
- assets/audio.wav +3 -0
- assets/sample_audio.mp3 +3 -0
- pyproject.toml +11 -0
- pyrightconfig.json +5 -0
- requirements.txt +22 -0
- requirements_local.txt +30 -0
.gitattributes
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python 虛擬環境
|
2 |
+
env/
|
3 |
+
env_new/
|
4 |
+
env_old/
|
5 |
+
venv/
|
6 |
+
ENV/
|
7 |
+
.env
|
8 |
+
.venv
|
9 |
+
# match any folder whose name contains 'env' (inline '#' is not a comment in .gitignore)
*env*/
|
10 |
+
|
11 |
+
# Python 快取檔案
|
12 |
+
__pycache__/
|
13 |
+
*.py[cod]
|
14 |
+
*$py.class
|
15 |
+
*.so
|
16 |
+
.Python
|
17 |
+
|
18 |
+
# 分發/打包
|
19 |
+
.Python
|
20 |
+
build/
|
21 |
+
develop-eggs/
|
22 |
+
dist/
|
23 |
+
downloads/
|
24 |
+
eggs/
|
25 |
+
.eggs/
|
26 |
+
lib/
|
27 |
+
lib64/
|
28 |
+
parts/
|
29 |
+
sdist/
|
30 |
+
var/
|
31 |
+
wheels/
|
32 |
+
*.egg-info/
|
33 |
+
.installed.cfg
|
34 |
+
*.egg
|
35 |
+
|
36 |
+
# 單元測試/覆蓋率報告
|
37 |
+
htmlcov/
|
38 |
+
.tox/
|
39 |
+
.coverage
|
40 |
+
.coverage.*
|
41 |
+
.cache
|
42 |
+
nosetests.xml
|
43 |
+
coverage.xml
|
44 |
+
*.cover
|
45 |
+
.hypothesis/
|
46 |
+
|
47 |
+
# Jupyter Notebook
|
48 |
+
.ipynb_checkpoints
|
49 |
+
|
50 |
+
# PyCharm, VSCode 等 IDE 設定
|
51 |
+
.idea/
|
52 |
+
.vscode/
|
53 |
+
*.swp
|
54 |
+
*.swo
|
55 |
+
|
56 |
+
# 操作系統相關
|
57 |
+
.DS_Store
|
58 |
+
Thumbs.db
|
59 |
+
|
60 |
+
# 專案特定
|
61 |
+
*.log
|
62 |
+
*.sqlite3
|
63 |
+
assets/*.wav
|
64 |
+
assets/*.mp3
|
65 |
+
env/
|
.vscodesettings_backup.json
ADDED
File without changes
|
README.md
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Automatic Speech Recognition Speech To Text
|
3 |
+
emoji: 🔊🔄📝
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.26.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: apache-2.0
|
11 |
+
short_description: Automatic-Speech-Recognition-Speech-to-Text
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
15 |
+
|
16 |
+
# Clone repository
|
17 |
+
git clone https://huggingface.co/spaces/hchcsuim/Automatic-Speech-Recognition-Speech-to-Text
|
18 |
+
cd Automatic-Speech-Recognition-Speech-to-Text
|
19 |
+
|
20 |
+
# windows 安裝 ffmpeg
|
21 |
+
https://ffmpeg.org/download.html
|
22 |
+
下載版本 ffmpeg-git-full.7z
|
23 |
+
解壓縮到 C:\ffmpeg
|
24 |
+
加入環境變數 系統變數 path C:\ffmpeg\bin
|
25 |
+
|
26 |
+
# 要在電腦環境先安裝 python 3.10
|
27 |
+
|
28 |
+
# Create and activate Python environment
|
29 |
+
python -m venv env
|
30 |
+
source env/bin/activate # for linux
|
31 |
+
env\Scripts\activate # for windows
|
32 |
+
|
33 |
+
# 或者在 vscode 手動切換預設環境
|
34 |
+
Ctrl+Shift+P
|
35 |
+
Python: Select Interpreter
|
36 |
+
|
37 |
+
# Install dependencies
|
38 |
+
# requirements.txt is for Hugging Face Spaces
|
39 |
+
pip install -r requirements_local.txt
|
40 |
+
|
41 |
+
# 驗證 GPU 支援
|
42 |
+
(env_new) C:\Users\user\Automatic-Speech-Recognition-Speech-to-Text>python
|
43 |
+
import torch
|
44 |
+
print(f"CUDA available: {torch.cuda.is_available()}")
|
45 |
+
if torch.cuda.is_available():
|
46 |
+
print(f"GPU: {torch.cuda.get_device_name(0)}")
|
47 |
+
|
48 |
+
# Run the app
|
49 |
+
python app.py
|
app.py
ADDED
@@ -0,0 +1,966 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
# from PIL import Image # Keep commented unless needed
|
3 |
+
import torch
|
4 |
+
from transformers import pipeline # Keep pipeline for standard models
|
5 |
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoConfig, AutoModelForCausalLM
|
6 |
+
import yt_dlp
|
7 |
+
import tempfile
|
8 |
+
import os
|
9 |
+
import shutil
|
10 |
+
import numpy as np
|
11 |
+
import time # For timestamp formatting
|
12 |
+
import soundfile as sf # For reading audio info
|
13 |
+
import traceback # For printing full errors
|
14 |
+
import platform
|
15 |
+
import re
|
16 |
+
import subprocess
|
17 |
+
|
18 |
+
# --- 硬體檢查函數 ---
|
19 |
+
def get_hardware_info():
    """Return a (cpu_name, gpu_name) pair for display in the UI.

    cpu_name falls back to "Unknown CPU" when detection fails; gpu_name is
    None unless a CUDA device is visible to torch. Full, unshortened device
    names are returned on purpose.
    """
    cpu_info = "Unknown CPU"
    system = platform.system()
    try:
        if system == "Windows":
            rows = subprocess.check_output("wmic cpu get name", shell=True).decode().strip().split('\n')
            if len(rows) >= 2:
                cpu_info = rows[1].strip()
        elif system == "Linux":
            with open('/proc/cpuinfo', 'r') as cpuinfo:
                for entry in cpuinfo:
                    if entry.startswith('model name'):
                        cpu_info = entry.split(':')[1].strip()
                        break
        elif system == "Darwin":  # macOS
            cpu_info = subprocess.check_output(
                "sysctl -n machdep.cpu.brand_string", shell=True
            ).decode().strip()
    except Exception as e:
        print(f"Error getting CPU info: {e}")

    gpu_info = None
    if torch.cuda.is_available():
        try:
            gpu_info = torch.cuda.get_device_name(0)
        except Exception as e:
            print(f"Error getting GPU info: {e}")

    return cpu_info, gpu_info
|
53 |
+
|
54 |
+
# --- Global Variables ---
# Lazily-initialized model state shared across Gradio callbacks. At most one
# of `pipe` (whisper-style transformers pipeline) or the `phi4_model` /
# `phi4_processor` pair is non-None at a time; the loaders reset the other.
pipe = None
phi4_model = None
phi4_processor = None
current_model_name = None  # id of the currently loaded model, None if none loaded
current_device = "cpu"  # "cpu" or "gpu"; defaults to CPU
|
60 |
+
|
61 |
+
# --- Model Data ---
# Ids of the large multimodal/audio LLMs (the latter two are disabled below).
PHI4_MODEL_ID = "microsoft/Phi-4-multimodal-instruct"
MERALION_MODEL_ID = "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
SEALLM_MODEL_ID = "SeaLLMs/SeaLLMs-Audio-7B"

# Catalogue of selectable models: id, rough parameter count, download size,
# bilingual availability labels, and a `type` that selects the loading path
# ("whisper" -> transformers pipeline, "phi4" -> AutoModelForCausalLM).
MODEL_DATA = [
    {"id": "openai/whisper-tiny", "params": "~39M", "size": "151 MB", "status_en": "Available", "status_zh": "可用", "type": "whisper"},
    {"id": "openai/whisper-base", "params": "~74M", "size": "290 MB", "status_en": "Available", "status_zh": "可用", "type": "whisper"},
    {"id": "openai/whisper-small", "params": "~244M", "size": "967 MB", "status_en": "Available", "status_zh": "可用", "type": "whisper"},
    {"id": "openai/whisper-medium", "params": "~769M", "size": "3.06 GB", "status_en": "Available (CPU Slow)", "status_zh": "可用 (CPU 慢)", "type": "whisper"},
    {"id": "openai/whisper-large", "params": "~1.55B", "size": "6.17 GB", "status_en": "Available (CPU Very Slow)", "status_zh": "可用 (CPU 極慢)", "type": "whisper"},
    {"id": "openai/whisper-large-v2", "params": "~1.55B", "size": "6.17 GB", "status_en": "Available (CPU Very Slow)", "status_zh": "可用 (CPU 極慢)", "type": "whisper"},
    {"id": "openai/whisper-large-v3", "params": "~1.55B", "size": "3.09 GB", "status_en": "Available (CPU Very Slow)", "status_zh": "可用 (CPU 極慢)", "type": "whisper"},
    {"id": "openai/whisper-large-v3-turbo", "params": "~809M", "size": "1.62 GB", "status_en": "Available (Optimized, CPU Slow)", "status_zh": "可用 (優化, CPU 慢)", "type": "whisper"},
    {"id": PHI4_MODEL_ID, "params": "~5.57B", "size": "11.15 GB", "status_en": "Multimodal (Need Trust, High RAM)", "status_zh": "多模態 (需信任,高RAM)", "type": "phi4"},
    # {"id": MERALION_MODEL_ID, "params": "~9.93B", "size": "19.85 GB", "status_en": "Experimental (Need Trust, High RAM)", "status_zh": "實驗性 (需信任,高RAM)", "type": "other"},
    # {"id": SEALLM_MODEL_ID, "params": "~8.29B", "size": "16.57 GB", "status_en": "Experimental (Need Trust, High RAM)", "status_zh": "實驗性 (需信任,高RAM)", "type": "other"},
]
# Fast lookup of a model's metadata by its id.
MODEL_INFO_DICT = {m['id']: m for m in MODEL_DATA}
# Dropdown choices as (display label, model id) pairs.
MODEL_CHOICES_WITH_PARAMS = [
    (f"{m['id'].split('/')[-1]} ({m['params']}, {m['size']}) - {m['status_en']} / {m['status_zh']}", m['id'])
    for m in MODEL_DATA
]
DEFAULT_MODEL = "openai/whisper-tiny"
|
85 |
+
|
86 |
+
# --- Language Data ---
|
87 |
+
BILINGUAL_LANGUAGES_DICT = {
|
88 |
+
"auto": "Auto-detect / 自動偵測", "en": "English / 英文", "zh": "Chinese / 中文", "de": "German / 德文", "es": "Spanish / 西班牙文",
|
89 |
+
"ru": "Russian / 俄文", "ko": "Korean / 韓文", "fr": "French / 法文", "ja": "Japanese / 日文", "pt": "Portuguese / 葡萄牙文", "tr": "Turkish / 土耳其文",
|
90 |
+
"pl": "Polish / 波蘭文", "ca": "Catalan / 加泰隆尼亞文", "nl": "Dutch / 荷蘭文", "ar": "Arabic / 阿拉伯文", "sv": "Swedish / 瑞典文", "it": "Italian / 義大利文",
|
91 |
+
"id": "Indonesian / 印尼文", "hi": "Hindi / 印地文", "fi": "Finnish / 芬蘭文", "vi": "Vietnamese / 越南文", "he": "Hebrew / 希伯來文", "uk": "Ukrainian / 烏克蘭文",
|
92 |
+
"el": "Greek / 希臘文", "ms": "Malay / 馬來文", "cs": "Czech / 捷克文", "ro": "Romanian / 羅馬尼亞文", "da": "Danish / 丹麥文", "hu": "Hungarian / 匈牙利文",
|
93 |
+
"ta": "Tamil / 坦米爾文", "no": "Norwegian / 挪威文", "th": "Thai / 泰文", "ur": "Urdu / 烏爾都文", "hr": "Croatian / 克羅埃西亞文", "bg": "Bulgarian / 保加利亞文",
|
94 |
+
"lt": "Lithuanian / 立陶宛文", "la": "Latin / 拉丁文", "mi": "Maori / 毛利文", "ml": "Malayalam / 馬拉雅拉姆文", "cy": "Welsh / 威爾斯文", "sk": "Slovak / 斯洛伐克文",
|
95 |
+
"te": "Telugu / 泰盧固文", "fa": "Persian / 波斯文", "lv": "Latvian / 拉脫維亞文", "bn": "Bengali / 孟加拉文", "sr": "Serbian / 塞爾維亞文", "az": "Azerbaijani / 亞塞拜然文",
|
96 |
+
"sl": "Slovenian / 斯洛維尼亞文", "kn": "Kannada / 坎那達文", "et": "Estonian / 愛沙尼亞文", "mk": "Macedonian / 馬其頓文", "br": "Breton / 布列塔尼文",
|
97 |
+
"eu": "Basque / 巴斯克文", "is": "Icelandic / 冰島文", "hy": "Armenian / 亞美尼亞文", "ne": "Nepali / 尼泊爾文", "mn": "Mongolian / 蒙古文", "bs": "Bosnian / 波士尼亞文",
|
98 |
+
"kk": "Kazakh / 哈薩克文", "sq": "Albanian / 阿爾巴尼亞文", "sw": "Swahili / 史瓦希里文", "gl": "Galician / 加利西亞文", "mr": "Marathi / 馬拉地文", "pa": "Punjabi / 旁遮普文",
|
99 |
+
"si": "Sinhala / 僧伽羅文", "km": "Khmer / 高棉文", "sn": "Shona / 修納文", "yo": "Yoruba / 約魯巴文", "so": "Somali / 索馬利文", "af": "Afrikaans / 南非荷蘭文",
|
100 |
+
"oc": "Occitan / 奧克西坦文", "ka": "Georgian / 喬治亞文", "be": "Belarusian / 白俄羅斯文", "tg": "Tajik / 塔吉克文", "sd": "Sindhi / 信德文", "gu": "Gujarati / 古吉拉特文",
|
101 |
+
"am": "Amharic / 安哈拉文", "yi": "Yiddish / 意第緒文", "lo": "Lao / 寮文", "uz": "Uzbek / 烏茲別克文", "fo": "Faroese / 法羅文", "ht": "Haitian Creole / 海地克里奧爾文",
|
102 |
+
"ps": "Pashto / 普什圖文", "tk": "Turkmen / 土庫曼文", "nn": "Nynorsk / 新挪威文", "mt": "Maltese / 馬爾他文", "sa": "Sanskrit / 梵文", "lb": "Luxembourgish / 盧森堡文",
|
103 |
+
"my": "Myanmar / 緬甸文", "bo": "Tibetan / 藏文", "tl": "Tagalog / 他加祿文", "mg": "Malagasy / 馬達加斯加文", "as": "Assamese / 阿薩姆文", "tt": "Tatar / 韃靼文",
|
104 |
+
"haw": "Hawaiian / 夏威夷文", "ln": "Lingala / 林加拉文", "ha": "Hausa / 豪沙文", "ba": "Bashkir / 巴什基爾文", "jw": "Javanese / 爪哇文", "su": "Sundanese / 巽他文",
|
105 |
+
"yue": "Cantonese / 粵語",
|
106 |
+
}
|
107 |
+
# Dropdown choices for Whisper: "auto" first, then every other language
# sorted alphabetically by its English display name.
def get_english_name(display_name_tuple):
    """Return the English half of a ('English / 中文', code) choice tuple."""
    return display_name_tuple[0].split('/')[0].strip()

sorted_languages = sorted(
    [(display_name, code) for code, display_name in BILINGUAL_LANGUAGES_DICT.items() if code != "auto"],
    key=get_english_name,
)
WHISPER_LANGUAGES_LIST = [(BILINGUAL_LANGUAGES_DICT["auto"], "auto"), *sorted_languages]

# Phi-4 audio input supports only this subset of language codes.
PHI4_AUDIO_LANG_CODES = ["auto", "en", "zh", "de", "fr", "it", "ja", "es", "pt"]
PHI4_LANGUAGES_LIST = [(BILINGUAL_LANGUAGES_DICT.get(code, code), code) for code in PHI4_AUDIO_LANG_CODES]
|
114 |
+
|
115 |
+
# --- Microphone Prompt ---
# Bilingual sample sentence shown beside the microphone input for users to
# read aloud (markdown; rendered in the UI, so the text itself is untouched).
MIC_PROMPT = """**Try Reading / 試著朗讀:**
"Success is stumbling from failure to failure with no loss of enthusiasm." - Winston Churchill
「成功是在一次又一次失敗中,依然熱情不減地前行。」 - 溫斯頓・邱吉爾"""
|
119 |
+
|
120 |
+
# --- YouTube Audio Download Function ---
|
121 |
+
def download_youtube_audio(url):
    """Download the audio track of a YouTube video as an MP3.

    Files are stored in a persistent per-run directory (not auto-deleted) so
    the Gradio audio player can keep referencing them afterwards.

    Returns:
        (audio_path, temp_dir, duration_seconds) on success, or
        (None, None, None) on failure. `temp_dir` is a scratch directory the
        caller is expected to clean up.
    """
    # Persistent directory so downloaded files survive beyond this call.
    download_dir = os.path.join(tempfile.gettempdir(), "youtube_downloads")
    os.makedirs(download_dir, exist_ok=True)

    # Video id + timestamp gives a unique, collision-free base filename.
    video_id = url.split("v=")[-1].split("&")[0] if "v=" in url else str(int(time.time()))
    filename = f"youtube_{video_id}_{int(time.time())}"

    temp_dir = tempfile.mkdtemp()
    downloaded_path = None
    try:
        # BUG FIX: the output template previously hard-coded "(unknown)"
        # instead of using `filename`, so every download shared one output
        # name and the fallback scan below (startswith(filename)) could
        # never match. Use the unique base name.
        temp_filepath_tmpl = os.path.join(download_dir, f"{filename}.%(ext)s")
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': temp_filepath_tmpl,
            'noplaylist': True,
            'quiet': True,
            'postprocessors': [{'key': 'FFmpegExtractAudio','preferredcodec': 'mp3','preferredquality': '192',}],
            'ffmpeg_location': shutil.which("ffmpeg"),
        }
        if not ydl_opts['ffmpeg_location']: print("Warning: ffmpeg not found... / 警告:找不到 ffmpeg...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)
            duration = info_dict.get('duration')
            title = info_dict.get('title', 'unknown')

            # The FFmpeg post-processor converts to .mp3; fix the extension
            # reported by prepare_filename (which reflects the pre-conversion file).
            final_filepath = ydl.prepare_filename(info_dict)
            if not final_filepath.endswith('.mp3'):
                base_name = final_filepath.rsplit('.', 1)[0]
                final_filepath = base_name + '.mp3'

            if os.path.exists(final_filepath):
                downloaded_path = final_filepath
                print(f"YouTube audio downloaded: {downloaded_path}")
                print(f"Title: {title}, Duration: {duration}s")
            else:
                # Fallback: scan for any mp3 produced under our unique prefix.
                potential_files = [os.path.join(download_dir, f) for f in os.listdir(download_dir) if f.startswith(filename) and f.endswith(".mp3")]
                if potential_files:
                    downloaded_path = potential_files[0]
                    print(f"Warning: Could not find expected MP3, using fallback: {downloaded_path}")
                    duration = None  # unknown for the fallback file
                else:
                    raise FileNotFoundError(f"Audio file not found after download in {download_dir}")

        return downloaded_path, temp_dir, duration
    except Exception as e:
        print(f"Error processing YouTube URL: {e}")
        # Best-effort cleanup of the scratch directory on failure.
        if temp_dir and os.path.exists(temp_dir):
            try: shutil.rmtree(temp_dir)
            except Exception as cleanup_e: print(f"Error cleaning temp directory {temp_dir}: {cleanup_e}")
        return None, None, None
|
173 |
+
|
174 |
+
# --- Timestamp Formatting ---
|
175 |
+
def format_timestamp(seconds):
    """Format a duration in seconds as H:MM:SS.mmm (e.g. 1:02:03.456).

    Returns "N/A" when `seconds` is None. Milliseconds are rounded to the
    nearest whole millisecond.
    """
    if seconds is None:
        return "N/A"
    total_ms = round(seconds * 1000)
    ms = total_ms % 1000
    total_s = int(total_ms // 1000)
    hours, rem = divmod(total_s, 3600)
    minutes, secs = divmod(rem, 60)
    return f"{hours:01d}:{minutes:02d}:{secs:02d}.{ms:03d}"
|
185 |
+
|
186 |
+
# --- 下載功能 ---
|
187 |
+
def update_download_file(filepath):
    """Pass through `filepath` for the download widget when it names an
    existing file; otherwise return None (hides/clears the widget)."""
    usable = bool(filepath) and os.path.exists(filepath)
    return filepath if usable else None
|
192 |
+
|
193 |
+
# --- YouTube 音訊處理 ---
|
194 |
+
def process_youtube_url(youtube_url):
    """Download audio for a YouTube URL and update the player/download widgets.

    Returns a pair of gr.update objects (audio player, download button). Both
    become visible with the downloaded file on success, and hidden/cleared
    when the URL is blank or the download fails.
    """
    if not youtube_url or not youtube_url.strip():
        return gr.update(visible=False, value=None), gr.update(visible=False, value=None)

    try:
        print(f"Processing YouTube URL: {youtube_url}")
        # Only the audio path matters here; drop the temp dir and duration.
        audio_path, _, _ = download_youtube_audio(youtube_url)
        if audio_path and os.path.exists(audio_path):
            return gr.update(visible=True, value=audio_path), gr.update(visible=True, value=audio_path)
        return gr.update(visible=False, value=None), gr.update(visible=False, value=None)
    except Exception as e:
        print(f"Error processing YouTube URL: {e}")
        return gr.update(visible=False, value=None), gr.update(visible=False, value=None)
|
212 |
+
|
213 |
+
# --- Load ASR Pipeline ---
|
214 |
+
def load_asr_pipeline(model_id):
    """Load a transformers ASR pipeline for `model_id` into the global `pipe`.

    Honors the module-level `current_device` ("cpu" or "gpu"), falling back to
    CPU when CUDA is unavailable. Any previously loaded Phi-4 model is
    released. Returns the pipeline; re-raises loading errors after logging.
    """
    global pipe, phi4_model, phi4_processor, current_device
    print(f"DEBUG: Loading ASR pipeline for {model_id} on device: {current_device}")
    # These community models ship custom code that must be trusted to run.
    trust_code = model_id in [MERALION_MODEL_ID, SEALLM_MODEL_ID]
    if trust_code: print(f"DEBUG: Setting trust_remote_code=True for pipeline model {model_id}")

    def _make_pipe(device):
        # Single construction point; only the target device varies per branch.
        return pipeline(
            "automatic-speech-recognition",
            model=model_id,
            trust_remote_code=trust_code,
            device=device,
        )

    try:
        # Only one model family is kept in memory at a time; drop Phi-4 state.
        phi4_model = None
        phi4_processor = None

        if current_device == "gpu":
            if torch.cuda.is_available():
                try:
                    pipe = _make_pipe("cuda")
                    # NOTE: an attention-mask warning on the first run is expected and harmless.
                    print(f"DEBUG: Using GPU (CUDA) for ASR pipeline. Available GPU: {torch.cuda.get_device_name(0)}")
                except Exception as e:
                    # Some transformers versions want an integer device index instead.
                    pipe = _make_pipe(0)
                    print(f"DEBUG: Using GPU (device=0) for ASR pipeline. Reason device='cuda' failed: {str(e)}")
            else:
                # GPU requested but no CUDA: fall back to CPU and warn.
                pipe = _make_pipe("cpu")
                print("WARNING: GPU selected but CUDA is not available. Falling back to CPU.")
        else:  # CPU
            pipe = _make_pipe("cpu")
            # NOTE: an attention-mask warning on the first run is expected and harmless.
            print("DEBUG: Using CPU for ASR pipeline.")

        print(f"DEBUG: Model loaded on device: {pipe.device}")
        return pipe
    except Exception as e:
        print(f"Error loading ASR pipeline for {model_id}:")
        traceback.print_exc()
        raise e
|
271 |
+
|
272 |
+
# --- Load Phi-4 Model ---
|
273 |
+
def load_phi4_model(model_id):
    """Load the Phi-4 multimodal model and processor into the globals.

    Honors the module-level `current_device`: on "gpu" it tries fp16 on
    "cuda", then "cuda:0", then falls back to fp32 on CPU; on "cpu" (or when
    CUDA is unavailable) it loads fp32 on CPU. Any previously loaded ASR
    pipeline is released. Returns (phi4_model, phi4_processor); re-raises
    loading errors, with a clearer hint when an optional dependency of the
    model's remote code (scipy/torchvision/peft) is missing.
    """
    global pipe, phi4_model, phi4_processor, current_device
    print(f"DEBUG: Loading Phi-4 model {model_id} on device: {current_device}")
    try:
        # Only one model family is kept in memory at a time; drop the pipeline.
        pipe = None
        phi4_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

        def _load(dtype, device):
            # Single construction point; only dtype and target device vary.
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=dtype,
                _attn_implementation="eager",
            )
            return model.to(device)

        if current_device == "gpu":
            if torch.cuda.is_available():
                try:
                    # fp16 halves GPU memory use.
                    phi4_model = _load(torch.float16, "cuda")
                    print(f"DEBUG: Using GPU (CUDA) for Phi-4. Available GPU: {torch.cuda.get_device_name(0)}")
                except Exception as e:
                    try:
                        # Retry with an explicit device index.
                        phi4_model = _load(torch.float16, "cuda:0")
                        print(f"DEBUG: Using GPU (device=0) for Phi-4. Reason for first attempt failure: {str(e)}")
                    except Exception as e2:
                        # Last resort: full precision on CPU.
                        phi4_model = _load(torch.float32, "cpu")
                        print(f"WARNING: Failed to use GPU for Phi-4, falling back to CPU. Error: {str(e2)}")
            else:
                phi4_model = _load(torch.float32, "cpu")
                print("WARNING: GPU selected but CUDA is not available. Falling back to CPU for Phi-4.")
        else:  # CPU
            phi4_model = _load(torch.float32, "cpu")
            print("DEBUG: Using CPU for Phi-4.")

        print(f"DEBUG: Phi-4 model loaded on device: {next(phi4_model.parameters()).device}")
        return phi4_model, phi4_processor
    except Exception as e:
        print(f"Error loading Phi-4 model {model_id}:")
        traceback.print_exc()
        # Phi-4's remote code needs these optional packages; surface a clearer hint.
        if "scipy" in str(e) or "torchvision" in str(e) or "peft" in str(e):
            missing_pkg = "scipy" if "scipy" in str(e) else "torchvision" if "torchvision" in str(e) else "peft"
            raise type(e)(f"{e}. Please ensure '{missing_pkg}' is in requirements.txt") from e
        else: raise e
|
345 |
+
|
346 |
+
# --- Main Transcription Function ---
|
347 |
+
def transcribe_audio(mic_input, file_input, youtube_url, selected_model_identifier,
                     task, language, return_timestamps,
                     phi4_prompt_text, device_choice,
                     previous_output_text, active_tab):
    """Transcribe or translate audio taken from the currently active input tab.

    Flow: apply the device choice, (re)load the selected model if needed,
    resolve the audio source (microphone / file upload / YouTube download),
    measure its duration, run inference (Phi-4 generate path or a HF ASR
    pipeline path), and append a timing summary plus the recognized text to
    the previously accumulated output.

    NOTE(review): the return shape is inconsistent — on the successful
    inference path this returns a plain string, while every early-exit and
    the post-`finally` path return a 4-tuple of
    (output_text, gr.update(), gr.update(), gr.update()). The wrapper
    `transcribe_audio_with_error_handling` normalizes both shapes; keep that
    in mind before calling this function directly.
    """
    global pipe, phi4_model, phi4_processor, current_model_name, current_device
    audio_source = None
    source_type_en = ""
    source_type_zh = ""
    temp_dir_to_clean = None          # YouTube temp dir, removed in `finally`
    audio_duration = None             # seconds; filled from yt-dlp, wave, or soundfile
    model_name_for_display = selected_model_identifier
    model_load_time = 0.0
    inference_time = 0.0
    model_type = MODEL_INFO_DICT.get(selected_model_identifier, {}).get("type", "other")
    output_text_accumulated = previous_output_text if previous_output_text else ""
    status_update_prefix = output_text_accumulated + ("\n\n---\n\n" if output_text_accumulated else "")
    final_output_text = output_text_accumulated

    # Apply the device selection; a device change invalidates every cached model.
    if device_choice != current_device:
        current_device = device_choice
        print(f"DEBUG: Device changed to {current_device}")
        # Force a model reload on the new device.
        pipe = None
        phi4_model = None
        phi4_processor = None
        current_model_name = None

    # --- Load Model ---
    model_changed = selected_model_identifier != current_model_name
    model_needs_load = (model_type == "phi4" and phi4_model is None) or (model_type != "phi4" and pipe is None)

    if model_changed or model_needs_load:
        warning_message = ""
        # Surface user-facing caveats (remote code, RAM needs, Phi-4 flow) up front.
        if selected_model_identifier in [PHI4_MODEL_ID, MERALION_MODEL_ID, SEALLM_MODEL_ID]:
            warning_message += f"Warning: Model {selected_model_identifier} requires executing remote code.\n警告: 模型 {selected_model_identifier} 需要執行遠端程式碼。\n"
        if "seallms" in selected_model_identifier.lower() or "meralion" in selected_model_identifier.lower(): warning_message += f"Warning: Model {selected_model_identifier} likely requires >16GB RAM.\n警告: 模型 {selected_model_identifier} 可能需要 >16GB RAM。\n"
        if model_type == "phi4": warning_message += f"Warning: Phi-4 uses a different process.\n警告: Phi-4 使用不同處理流程。\n"
        print(f"Attempting to load model / 嘗試載入模型: {selected_model_identifier} (Type / 類型: {model_type})")
        status_update_str = warning_message + f"Loading model / 正在載入模型: {selected_model_identifier}..."
        # No yield here; progress is reflected by rewriting output_text_accumulated.
        output_text_accumulated = status_update_prefix + status_update_str

        load_start_time = time.monotonic()
        try:
            if model_type == "phi4":
                # Phi-4 uses a raw model + processor instead of a pipeline.
                phi4_model, phi4_processor = load_phi4_model(selected_model_identifier)
                pipe = None
            else:
                pipe = load_asr_pipeline(selected_model_identifier)
                phi4_model = None
                phi4_processor = None
            load_end_time = time.monotonic()
            model_load_time = load_end_time - load_start_time
            current_model_name = selected_model_identifier
            model_name_for_display = current_model_name
            print(f"Model {current_model_name} loaded successfully ({model_load_time:.2f}s). / 模型 {current_model_name} 載入成功 ({model_load_time:.2f} 秒).")
            status_update_str = warning_message + f"Model {current_model_name} loaded successfully / 載入成功 ({model_load_time:.2f}s)."
            output_text_accumulated = status_update_prefix + status_update_str
        except Exception as e:
            load_end_time = time.monotonic()
            model_load_time = load_end_time - load_start_time
            print(f"Failed to load model {selected_model_identifier} ({model_load_time:.2f}s). / 載入模型 {selected_model_identifier} 失敗 ({model_load_time:.2f} 秒).")
            error_msg = f"Error: Failed to load model {selected_model_identifier}:\n錯誤: 載入模型 {selected_model_identifier} 失敗:\n{e}\n({model_load_time:.2f}s)"
            # Translate common failure modes into actionable hints.
            if "requires `accelerate`" in str(e): error_msg += "\n**Missing 'accelerate'. Please install. / 缺少 'accelerate',請安裝.**"
            if isinstance(e, (MemoryError, RuntimeError)) and "out of memory" in str(e).lower(): error_msg += "\n**Out of Memory. Try a smaller model. / 記憶體不足,請嘗試較小模型.**"
            if "trust_remote_code=True" in str(e): error_msg += "\n**Requires trusting remote code. Model might be unsafe. / 需要信任遠端代碼,模型可能不安全.**"
            if "scipy" in str(e) or "torchvision" in str(e) or "peft" in str(e):
                missing_pkg = "scipy" if "scipy" in str(e) else "torchvision" if "torchvision" in str(e) else "peft"
                error_msg += f"\n**Missing '{missing_pkg}'. Please install. / 缺少 '{missing_pkg}',請安裝.**"
            status_update_str = warning_message + error_msg
            # Reset all caches so the next attempt starts clean.
            pipe = None
            phi4_model = None
            phi4_processor = None
            current_model_name = None
            output_text_accumulated = status_update_prefix + status_update_str
            return (output_text_accumulated, gr.update(), gr.update(), gr.update()) # Keep inputs

    # --- Check if model loaded ---
    if (model_type == "phi4" and phi4_model is None) or (model_type != "phi4" and pipe is None):
        output_text_accumulated = status_update_prefix + "Error: Cannot use model. / 錯誤: 無法使用模型."
        return (output_text_accumulated, gr.update(), gr.update(), gr.update())

    # --- Determine Input Source & Get Duration ---
    # Pick the input that belongs to the currently active tab.
    print(f"DEBUG: Active tab is {active_tab}")

    if active_tab == "mic" and mic_input is not None:
        audio_source = mic_input
        source_type_en = "Microphone"
        source_type_zh = "麥克風"
    elif active_tab == "file" and file_input is not None:
        # The File component may hand back a single path or a list of paths.
        if isinstance(file_input, list) and len(file_input) > 0:
            # Use the first file from a multi-file upload.
            audio_source = file_input[0]
        else:
            audio_source = file_input
        source_type_en = "File Upload"
        source_type_zh = "檔案上傳"
    elif active_tab == "youtube" and youtube_url and youtube_url.strip():
        source_type_en = "YouTube"
        source_type_zh = "YouTube"
        status_update_str = f"Downloading YouTube Audio / 正在下載 YouTube 音訊..."
        output_text_accumulated = status_update_prefix + status_update_str
        audio_path, temp_dir, duration_yt = download_youtube_audio(youtube_url)
        if audio_path:
            audio_source = audio_path
            temp_dir_to_clean = temp_dir
            audio_duration = duration_yt
        else:
            output_text_accumulated = status_update_prefix + "Error: Failed to download YouTube audio. / 錯誤:無法下載 YouTube 音訊。"
            return (output_text_accumulated, gr.update(), gr.update(), gr.update())
    else:
        # Nothing usable on the active tab — keep the previous output untouched.
        return (previous_output_text, gr.update(), gr.update(), gr.update()) # No input

    if audio_source is None:
        output_text_accumulated = status_update_prefix + f"Error: No audio file provided. / 錯誤:未提供音訊檔案."
        return (output_text_accumulated, gr.update(), gr.update(), gr.update())

    # The resolved source must exist on disk.
    if not os.path.exists(audio_source):
        output_text_accumulated = status_update_prefix + f"Error: Audio file not found '{audio_source}'. / 錯誤:找不到音訊檔案 '{audio_source}'."
        return (output_text_accumulated, gr.update(), gr.update(), gr.update())

    # Reject anything that is not a recognized audio container (by extension).
    valid_audio_extensions = ['.wav', '.mp3', '.ogg', '.flac', '.m4a', '.aac']
    file_ext = os.path.splitext(audio_source)[1].lower()
    if file_ext not in valid_audio_extensions:
        output_text_accumulated = status_update_prefix + f"Error: Invalid audio file format '{file_ext}'. / 錯誤:無效的音訊檔案格式 '{file_ext}'."
        return (output_text_accumulated, gr.update(), gr.update(), gr.update())

    if audio_duration is None:
        try:
            # Choose a duration probe appropriate to the container format.
            if file_ext == '.wav':
                # WAV: the stdlib wave module is cheapest.
                import wave
                try:
                    with wave.open(audio_source, 'rb') as wf:
                        frames = wf.getnframes()
                        rate = wf.getframerate()
                        audio_duration = frames / float(rate)
                        print(f"Got audio duration from wave module / 從 wave 模塊獲取音檔時長: {audio_duration:.2f}s")
                except Exception as wave_err:
                    print(f"Could not get audio duration from wave module / 無法從 wave 模塊獲取音檔時長: {wave_err}")
                    # Fall back to soundfile when the wave module fails.
                    info = sf.info(audio_source)
                    audio_duration = info.duration
                    print(f"Got audio duration from soundfile / 從 soundfile 獲取音檔時長: {audio_duration:.2f}s")
            else:
                # All other formats: soundfile.
                info = sf.info(audio_source)
                audio_duration = info.duration
                print(f"Got audio duration from soundfile / 從 soundfile 獲取音檔時長: {audio_duration:.2f}s")
        except Exception as e:
            print(f"Could not get audio duration / 無法獲取音檔時長: {e}")
            # Duration unavailable — use 0.0 so the relative-speed math is skipped.
            audio_duration = 0.0
            print(f"Using default audio duration / 使用默認音檔時長: {audio_duration:.2f}s")

    print(f"Processing with {current_model_name} from [{source_type_en} / {source_type_zh}]: {audio_source}")
    print(f"Options: Task='{task}', Language(Source)='{language}', Timestamps='{return_timestamps}'")
    if model_type == "phi4": print(f"Phi-4 Prompt: '{phi4_prompt_text}'")

    status_update_str = f"Processing, please wait... / 正在處理,請稍候...\n(Model / 模型: {model_name_for_display})"
    output_text_accumulated = status_update_prefix + status_update_str

    # --- Execute & Timing ---
    inference_start_time = time.monotonic()
    current_run_output = ""
    timing_info_str = ""
    try:
        if model_type == "phi4":
            print("DEBUG: Processing with Phi-4...")
            if not phi4_model or not phi4_processor: raise ValueError("Phi-4 model/processor not loaded / Phi-4 模型/處理器未載入")
            if not phi4_prompt_text: raise ValueError("Phi-4 requires a prompt text / Phi-4 需要提示文字")
            # Build the Phi-4 chat prompt with the audio placeholder token.
            user_prompt_tag='<|user|>'
            assistant_prompt_tag='<|assistant|>'
            end_tag='<|end|>'
            prompt = f"{user_prompt_tag}<|audio_1|>{phi4_prompt_text}{end_tag}{assistant_prompt_tag}"
            audio_data, samplerate = sf.read(audio_source)
            inputs = phi4_processor(text=prompt, audios=[(audio_data, samplerate)], return_tensors='pt').to(phi4_model.device)
            with torch.no_grad(): generate_ids = phi4_model.generate(**inputs, max_new_tokens=500, num_logits_to_keep=0) # Added num_logits_to_keep=0
            # Strip the prompt tokens so only the generated answer is decoded.
            generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
            result_text = phi4_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            current_run_output = result_text.strip()
            # Phi-4 has no timestamp support; force the flag off for formatting.
            return_timestamps = False
        else: # Whisper or other pipeline models
            print("DEBUG: Processing with ASR pipeline...")
            if not pipe: raise ValueError("ASR pipeline not loaded / ASR pipeline 未載入")
            generate_kwargs_pipe = {"task": task}

            # Interpret the language parameter per task.
            if task == "transcribe":
                # For transcription, `language` is the source language.
                if language != "auto":
                    generate_kwargs_pipe["language"] = language
                    print(f"DEBUG: Setting source language to {language} for transcription")
            else: # translate
                # Whisper can only translate into English, so `language` is
                # ignored here and merely logged as the intended target.
                print(f"DEBUG: Translation target language is {language}, but Whisper only supports English as target")

            # Pipeline call parameters.
            pipeline_kwargs = {
                "chunk_length_s": 30,
                "batch_size": 1,
                "return_timestamps": "chunks" if return_timestamps else False,
                "generate_kwargs": generate_kwargs_pipe
            }

            # The first run may emit an attention-mask warning; it is harmless,
            # and subsequent runs are faster and warning-free.
            result = pipe(audio_source, **pipeline_kwargs)

            print("DEBUG: pipe() call finished.")
            print("DEBUG: Raw result type:", type(result))
            print("DEBUG: Raw result content:", result)

            # Normalize the several result shapes pipelines can return.
            if return_timestamps and isinstance(result, dict) and "chunks" in result:
                formatted_chunks = [f"[{format_timestamp(chunk.get('timestamp', (None,))[0])} -> {format_timestamp(chunk.get('timestamp', (None, None))[1])}] {chunk.get('text', '').strip()}" for chunk in result["chunks"]]
                current_run_output = "\n".join(formatted_chunks).strip()
            elif isinstance(result, dict) and "text" in result:
                current_run_output = result["text"].strip()
            elif isinstance(result, str):
                current_run_output = result.strip()
            elif isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict) and 'generated_text' in result[0]:
                current_run_output = result[0]['generated_text'].strip()
            else:
                current_run_output = f"(Unrecognized result format / 無法識別的結果格式: {type(result)})"

            print("DEBUG: Processed result:", current_run_output[:100] + "..." if len(current_run_output) > 100 else current_run_output)

        inference_end_time = time.monotonic()
        inference_time = inference_end_time - inference_start_time
        if not current_run_output: current_run_output = "(Audio empty or unrecognizable / 音檔空白或無法辨識)"

        # --- Format Timing Info (Plain Text, EN / ZH) ---
        timing_info_str = f"Model / 模型: {model_name_for_display}\n"
        if model_load_time > 0: timing_info_str += f"Model Load Time / 模型載入時間: {model_load_time:.2f} seconds / 秒\n"
        timing_info_str += f"Inference Time / 推論時間: {inference_time:.2f} seconds / 秒\n"
        relative_speed_str = "(relative time unavailable / 無法計算相對時間)"
        if audio_duration is not None and audio_duration > 0:
            relative_speed = inference_time / audio_duration
            # Corrected format for relative speed
            relative_speed_str = f"audio duration / 音檔長度 x {relative_speed:.2f}"
            timing_info_str += f"audio duration / 音檔時長: {audio_duration:.2f} seconds / 秒\n"
        timing_info_str += f"relative speed / 相對速度: {relative_speed_str}" # Corrected format

        print(f"Processing finished. / 處理完成。")
        print(timing_info_str.replace('\n', ' | '))
        print(f"Result Text / 結果文字:\n{current_run_output}") # Print result text

        # Guarantee non-empty result text.
        if not current_run_output or current_run_output.strip() == "":
            current_run_output = "No text detected in audio / 音頻中未檢測到文字"

        # Assemble the final output: previous text, timing block, result text.
        final_output_text = ""
        if status_update_prefix and status_update_prefix.strip():
            final_output_text += status_update_prefix + "\n"

        # Model + timing information.
        final_output_text += timing_info_str + "\n\n"

        # The recognized text, explicitly labeled.
        final_output_text += "Result Text / 結果文字:\n" + current_run_output

        # Guard against empty or dot-only output (a known Gradio quirk).
        final_output_text = final_output_text.strip()
        if final_output_text == "." or not final_output_text:
            final_output_text = timing_info_str + "\n\nResult Text / 結果文字:\n" + current_run_output

        # NOTE(review): this second "." check is unreachable — the branch above
        # already replaced any dot-only value with text starting with "Model".
        if final_output_text == ".":
            print("DEBUG: Detected dot-only output, fixing...")
            # Rebuild a meaningful output string.
            fixed_output = f"{timing_info_str}\n\nResult Text / 結果文字:\n{current_run_output}"
            return fixed_output
        # Success path returns a plain string (normalized by the caller wrapper).
        return final_output_text

    except Exception as e:
        inference_end_time = time.monotonic()
        inference_time = inference_end_time - inference_start_time
        print(f"DEBUG: Exception occurred during processing / 處理過程中發生錯誤:")
        traceback.print_exc()
        error_message = f"Processing Failed / 處理失敗:\n{e}"
        final_output_text = (status_update_prefix + error_message).strip()
        timing_info_str = f"Model / 模型: {model_name_for_display}\n"
        if model_load_time > 0: timing_info_str += f"Model Load Time / 模型載入時間: {model_load_time:.2f} seconds / 秒\n"
        timing_info_str += f"Inference Time (until error) / 推論時間 (至錯誤): {inference_time:.2f} seconds / 秒\n"
        timing_info_str += "Processing Failed / 處理失敗"
        final_output_text += "\n\n" + timing_info_str
        if isinstance(e, (MemoryError, RuntimeError)) and "out of memory" in str(e).lower(): final_output_text += "\n\nOut of Memory, try smaller model. / 記憶體不足,請用小模型."

    finally:
        # Always remove the YouTube download scratch directory, success or not.
        if temp_dir_to_clean:
            print(f"Cleaning YouTube temp files / 清理 YouTube 暫存: {temp_dir_to_clean}")
            # Corrected finally block syntax
            try:
                shutil.rmtree(temp_dir_to_clean)
            except Exception as e:
                print(f"Failed to clean temp files / 清理暫存失敗: {e}")

    # Only the exception path falls through to here (success returns inside try).
    print("DEBUG: Returning final result tuple...")
    # Return final tuple: Update output_text, KEEP inputs by using gr.update()
    # If final_output_text is a dict (raw ASR pipeline output), return it as-is;
    # otherwise return the standard 4-tuple.
    if isinstance(final_output_text, dict):
        return final_output_text
    else:
        return (final_output_text, gr.update(), gr.update(), gr.update())
|
668 |
+
|
669 |
+
|
670 |
+
# --- UI Update Functions ---
|
671 |
+
# 添加一個函數來更新音頻播放器
|
672 |
+
def update_file_audio_player(file_path):
    """Show the audio preview player only for an existing, playable upload.

    Accepts either a single file path or a list of paths (multi-file
    upload); only the first list entry is previewed. Returns a Gradio
    update that either reveals the player with the file loaded, or hides
    and clears it.
    """
    hidden = gr.update(value=None, visible=False)

    if file_path is None:
        return hidden

    # Multi-file uploads arrive as a list; preview the first entry only.
    if isinstance(file_path, list) and len(file_path) > 0:
        file_path = file_path[0]

    # The path must point at a real file on disk.
    if not os.path.exists(file_path):
        return hidden

    # Only known audio containers are previewable.
    playable_extensions = ['.wav', '.mp3', '.ogg', '.flac', '.m4a', '.aac']
    extension = os.path.splitext(file_path)[1].lower()
    if extension not in playable_extensions:
        return hidden

    return gr.update(value=file_path, visible=True)
|
692 |
+
|
693 |
+
def update_task_choices(selected_model_id):
    """Refresh the task radio choices to match the selected model.

    Whisper models get a translate label clarifying that translation only
    targets English; every other model keeps the generic translate label.
    """
    model_type = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other")
    translate_label = (
        "Translate (Whisper only to English) / 翻譯 (Whisper 僅支援轉譯至英文)"
        if model_type == "whisper"
        else "Translate / 轉譯"
    )
    return gr.update(choices=[("Transcribe / 轉錄", "transcribe"), (translate_label, "translate")])
|
698 |
+
|
699 |
+
def update_phi4_prompt_ui(selected_model_id, task, language_code):
    """Show and pre-fill the Phi-4 prompt box for Phi-4 models, hide it otherwise.

    The suggested prompt depends on the task: for transcription,
    `language_code` is the source language; for translation it is the
    target language (English/auto fall back to a plain translate prompt).
    """
    is_phi4 = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other") == "phi4"
    prompt_text = ""
    if is_phi4:
        # English-only language name, e.g. "French / 法文" -> "French".
        display_name = BILINGUAL_LANGUAGES_DICT.get(language_code, language_code)
        english_name = display_name.split('/')[0].strip()
        if task == "transcribe":
            prompt_text = (
                "Transcribe the audio to text."
                if language_code == "auto"
                else f"Transcribe the audio in {english_name}."
            )
        elif task == "translate":
            # For translation, language_code names the target language.
            prompt_text = (
                "Translate the audio to text."
                if language_code in ("auto", "en")
                else f"Detect the language in the audio and translate it to {english_name}."
            )
    # Single update drives both the Textbox visibility and its value.
    return gr.update(visible=is_phi4, value=prompt_text)
|
723 |
+
|
724 |
+
def update_language_choices(selected_model_id):
    """Swap the language dropdown list to the one the selected model supports.

    Phi-4 models get the Phi-4 language list; everything else gets the
    Whisper list. The selection is reset to "auto" either way.
    """
    model_type = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other")
    language_list = PHI4_LANGUAGES_LIST if model_type == "phi4" else WHISPER_LANGUAGES_LIST
    return gr.update(choices=language_list, value="auto")
|
728 |
+
|
729 |
+
def update_timestamp_visibility(selected_model_id):
    """Hide the timestamp checkbox for Phi-4 models (no timestamp support)."""
    model_type = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other")
    show_checkbox = model_type != "phi4"
    print(f"DEBUG: Updating timestamp visibility for {selected_model_id}. Type: {model_type}. Visible: {show_checkbox}") # Debug print
    return gr.update(visible=show_checkbox)
|
733 |
+
|
734 |
+
def update_language_ui(model_id, task):
    """Relabel (and possibly hide) the language selector for the model/task combo.

    Whisper can only translate into English, so the selector is hidden for
    Whisper + translate. Otherwise it stays visible, labeled as the source
    language for transcription or the target language for translation.
    """
    model_type = MODEL_INFO_DICT.get(model_id, {}).get("type", "other")
    target_label = "Target Language / 目標語言"

    if task == "translate":
        # Whisper's translate task has a fixed English target.
        if model_type == "whisper":
            return gr.update(visible=False, label=target_label)
        return gr.update(visible=True, label=target_label)

    # Transcription: the selector names the source language.
    return gr.update(visible=True, label="Source Language / 來源語言")
|
747 |
+
|
748 |
+
# --- Gradio Interface ---
|
749 |
+
# Preserving user's CSS choices
|
750 |
+
compact_css = """
|
751 |
+
.tabitem { margin: 0rem !important; padding: 0rem !important;}
|
752 |
+
.compact-file > div { min-height: unset !important; }
|
753 |
+
"""
|
754 |
+
|
755 |
+
# 移除 JavaScript 代碼,改用純 CSS 解決方案
|
756 |
+
|
757 |
+
# Build the Gradio UI: input tabs + options on the left, submit button and
# the accumulated result textbox on the right.
with gr.Blocks(css=compact_css, theme=gr.themes.Default(spacing_size=gr.themes.sizes.spacing_sm, text_size=gr.themes.sizes.text_sm)) as demo:
    # Title only; GPU status is intentionally not shown here.
    gr.Markdown("# Automatic Speech Recognition(ASR) & Speech to Text(STT) / 語音辨識、語音轉文字 🔊🔄📝\nUse AI models to transcribe or translate speech from microphone, file uploads, or YouTube. / 使用 AI 模型轉錄或翻譯來自麥克風、上傳檔案或 YouTube 的語音。")

    with gr.Row():
        # Left Column: Input & Options
        with gr.Column(scale=4): # Preserving user's scale
            # Hidden state tracking which input tab is currently active.
            active_tab = gr.State(value="mic")  # default: microphone tab

            # Tab-switch callback: just stores the tab name into the state.
            def set_active_tab(tab_name):
                return tab_name

            with gr.Tabs() as tabs:
                with gr.TabItem("🎤 Microphone / 麥克風") as mic_tab:
                    gr.Markdown(MIC_PROMPT, elem_classes="compact-markdown")
                    mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio / 錄製音訊")
                    download_output = gr.File(label="Download Recording / 下載錄音檔", interactive=False, elem_classes="compact-file")

                with gr.TabItem("📁 Upload File / 上傳檔案") as file_tab:
                    # A File component is used instead of Audio to sidestep
                    # Gradio's audio preprocessing issues.
                    file_input = gr.File(label="Upload Audio File / 上傳音訊檔", file_types=["audio"], type="filepath")

                    # Preview player for the uploaded file.
                    file_audio_player = gr.Audio(label="Audio Preview / 音訊預覽", interactive=False, visible=False)

                with gr.TabItem("▶️ YouTube") as youtube_tab:
                    youtube_input = gr.Textbox(label="YouTube URL / 網址", placeholder="Paste YouTube link here / 在此貼上 YouTube 連結")
                    gr.Examples(examples=[["https://www.youtube.com/watch?v=5D7l0tqQJ7k"]], inputs=[youtube_input], label="Example YouTube URL / 範例 YouTube 網址")

                    # YouTube audio preview player and download widget.
                    with gr.Row():
                        youtube_audio_player = gr.Audio(label="YouTube Audio / YouTube 音訊", interactive=False, visible=False)
                        youtube_download = gr.File(label="Download YouTube Audio / 下載 YouTube 音訊", interactive=False, visible=False, elem_classes="compact-file")

            # Record which tab is active so submit reads the matching input.
            mic_tab.select(fn=lambda: set_active_tab("mic"), inputs=[], outputs=[active_tab])
            file_tab.select(fn=lambda: set_active_tab("file"), inputs=[], outputs=[active_tab])
            youtube_tab.select(fn=lambda: set_active_tab("youtube"), inputs=[], outputs=[active_tab])

            # Options in a single column with ID for spacing
            with gr.Column(elem_id="options-block"): # elem_id for CSS targeting if needed
                model_select = gr.Dropdown(choices=MODEL_CHOICES_WITH_PARAMS, label="Model / 模型", value=DEFAULT_MODEL, elem_classes="compact-label")
                # Show the concrete CPU and GPU model names in the device picker.
                cpu_info, gpu_info = get_hardware_info()
                device_choices = [(f"CPU ({cpu_info})", "cpu")]
                if torch.cuda.is_available() and gpu_info:
                    device_choices.append((f"GPU ({gpu_info})", "gpu"))
                device_input = gr.Radio(choices=device_choices, label="Device / 設備", value="cpu", elem_classes="compact-label radio-align")
                task_input = gr.Radio(choices=[("Transcribe / 轉錄", "transcribe"), ("Translate / 轉譯", "translate")], label="Task / 任務", value="transcribe", elem_classes="compact-label radio-align")
                language_input = gr.Dropdown(choices=WHISPER_LANGUAGES_LIST, label="Source Language / 來源語言", value="auto", elem_classes="compact-label")
                # Phi-4 prompt directly in the column, no Accordion
                phi4_prompt_input = gr.Textbox(label="Only for Phi-4 Prompt / 僅用於 Phi-4 指令", placeholder="e.g., Transcribe the audio to text.", lines=1, visible=False, elem_classes="compact-label") # Preserving user label and params
                timestamp_input = gr.Checkbox(label="Show Timestamps / 顯示時間戳", value=False, elem_classes="compact-label checkbox-align") # Preserving user label

        # Right Column: Output
        with gr.Column(scale=6): # Preserving user's scale
            submit_button = gr.Button("Submit / 提交", variant="primary") # Preserving user's text and placement
            output_text = gr.Textbox(
                label="Result / 結果",
                lines=25, # visible rows
                max_lines=25, # rows beyond this scroll
                interactive=True,
                placeholder="Results appear here (new results appended). / 結果將顯示在此 (新結果會附加在後面)",
                elem_classes="result-textbox", # keep the CSS class
                autoscroll=False # let the user control scrolling
            )

    # --- Event Listeners ---
    model_select.change(fn=update_language_choices, inputs=model_select, outputs=language_input)
    model_select.change(fn=update_task_choices, inputs=[model_select], outputs=[task_input])
    # Keep the Phi-4 prompt in sync with model/task/language changes.
    model_select.change(fn=update_phi4_prompt_ui, inputs=[model_select, task_input, language_input], outputs=[phi4_prompt_input])
    task_input.change(fn=update_phi4_prompt_ui, inputs=[model_select, task_input, language_input], outputs=[phi4_prompt_input])
    language_input.change(fn=update_phi4_prompt_ui, inputs=[model_select, task_input, language_input], outputs=[phi4_prompt_input])

    # Relabel/hide the language picker based on model & task.
    task_input.change(fn=update_language_ui, inputs=[model_select, task_input], outputs=language_input)
    model_select.change(fn=update_language_ui, inputs=[model_select, task_input], outputs=language_input)
    # Link timestamp visibility function
    model_select.change(fn=update_timestamp_visibility, inputs=model_select, outputs=timestamp_input)

    # Offer the finished microphone recording for download.
    mic_input.change(fn=update_download_file, inputs=mic_input, outputs=download_output)

    # Preview uploaded audio files.
    file_input.change(fn=update_file_audio_player, inputs=file_input, outputs=file_audio_player)

    # Fetch YouTube audio for preview/download as soon as a URL is entered.
    youtube_input.change(
        fn=process_youtube_url,
        inputs=youtube_input,
        outputs=[youtube_audio_player, youtube_download],
        show_progress=True
    )

    # Wrapper that calls transcribe_audio directly (no yield) and normalizes
    # its mixed string/tuple result into the 4-tuple the click handler needs.
    def transcribe_audio_with_error_handling(*args):
        try:
            # args[3] is selected_model_identifier (matches inputs= below).
            selected_model_identifier = args[3]
            model_name_for_display = selected_model_identifier

            # Resolve the audio source (used only for logging here).
            audio_source = None
            active_tab = args[-1]  # last positional arg is the active tab name

            if active_tab == "mic" and args[0] is not None:
                audio_source = args[0]
            elif active_tab == "file" and args[1] is not None:
                if isinstance(args[1], list) and len(args[1]) > 0:
                    audio_source = args[1][0]
                else:
                    audio_source = args[1]
            elif active_tab == "youtube" and args[2] and args[2].strip():
                # YouTube handling is more involved; no time estimate here.
                pass

            # Log when a concrete audio file is about to be processed.
            if audio_source and os.path.exists(audio_source):
                print(f"Processing audio file: {audio_source}")

                # Visually separate runs in the console log.
                print("\n" + "="*50)
                print("NEW TRANSCRIPTION PROCESS STARTED")
                print("="*50 + "\n")

            # Wall-clock timing for the whole run.
            start_time = time.time()

            # Delegate the real work to transcribe_audio.
            result = transcribe_audio(*args)

            elapsed_time = time.time() - start_time

            # result is expected to be a text string on success.
            print("DEBUG: Result type:", type(result))
            print("DEBUG: Final result:", result)

            if isinstance(result, str):
                if result.strip() == ".":
                    # Known issue: a bare "." occasionally comes back;
                    # synthesize a readable replacement.
                    print("DEBUG: Detected dot-only output in handler, fixing...")

                    # Temporary workaround — point the user at the console,
                    # where the full transcription was already printed.
                    model_info = f"Model / 模型: {model_name_for_display}"
                    inference_time_info = f"Processing Time / 處理時間: {elapsed_time:.2f} seconds / 秒"

                    final_text = f"{model_info}\n{inference_time_info}\n\nResult Text / 結果文字:\n"
                    final_text += "(Please check console for complete transcription / 請查看控制台獲取完整轉錄)"

                    print("DEBUG: Created replacement result:", final_text[:100] + "..." if len(final_text) > 100 else final_text)
                else:
                    # Normal result — use as-is.
                    final_text = result
                    print("DEBUG: Using original result text")
            else:
                # Non-string result — build a fallback message.
                final_text = f"Model / 模型: {model_name_for_display}\n"
                final_text += f"Processing Time / 處理時間: {elapsed_time:.2f} seconds / 秒\n\n"
                final_text += "(No text detected in audio / 音頻中未檢測到文字)"
                print("DEBUG: Created new result for non-string:", final_text[:100] + "..." if len(final_text) > 100 else final_text)

            return final_text, gr.update(), gr.update(), gr.update()
        except Exception as e:
            import traceback
            error_msg = f"Error during processing: {str(e)}\n\n{traceback.format_exc()}"
            print(error_msg)

            # Surface the error in the result box; leave inputs unchanged.
            return f"處理過程中發生錯誤 / Error during processing:\n{str(e)}", gr.update(), gr.update(), gr.update()

    # Main submit action - Corrected outputs list
    submit_button.click(
        fn=transcribe_audio_with_error_handling,
        inputs=[mic_input, file_input, youtube_input, model_select, task_input, language_input, timestamp_input, phi4_prompt_input, device_input, output_text, active_tab],
        outputs=[output_text, mic_input, file_input, youtube_input], # keep original outputs
        show_progress="full" # show the full progress bar
    )
|
942 |
+
|
943 |
+
|
944 |
+
|
945 |
+
# --- Launch App ---
|
946 |
+
if __name__ == "__main__":
|
947 |
+
# 獲取硬體信息
|
948 |
+
cpu_info, gpu_info = get_hardware_info()
|
949 |
+
has_gpu = gpu_info is not None
|
950 |
+
|
951 |
+
print(f"CPU: {cpu_info}")
|
952 |
+
if has_gpu:
|
953 |
+
print(f"GPU: {gpu_info}")
|
954 |
+
else:
|
955 |
+
print("No GPU detected")
|
956 |
+
|
957 |
+
# REMEMBER: Update requirements.txt with accelerate, scipy, torchvision, peft
|
958 |
+
demo.launch(
|
959 |
+
debug=True,
|
960 |
+
max_threads=4, # 減少最大線程數,提高穩定性
|
961 |
+
show_error=True, # 顯示錯誤詳情
|
962 |
+
server_name="127.0.0.1", # 本地運行
|
963 |
+
server_port=7860, # 指定端口
|
964 |
+
quiet=False, # 顯示所有日誌
|
965 |
+
prevent_thread_lock=True # 防止線程鎖定
|
966 |
+
)
|
assets/audio.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1b0481618037c60e33386078b873bbc90d4d28af8e07cf11da5aec2031645a49
|
3 |
+
size 1444758
|
assets/sample_audio.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2a7e73ccdc15b77808aa28d8ea7d6c86bae49f9d051d6d5d843faf1fb40c834
|
3 |
+
size 80640
|
pyproject.toml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.pyright]
|
2 |
+
include = ["app.py"]
|
3 |
+
exclude = ["**/node_modules", "**/__pycache__", "**/env", "**/env_new", "**/envsource"]
|
4 |
+
reportMissingImports = false
|
5 |
+
reportGeneralTypeIssues = false
|
6 |
+
reportOptionalMemberAccess = false
|
7 |
+
reportOptionalSubscript = false
|
8 |
+
reportOptionalCall = false
|
9 |
+
reportOptionalIterable = false
|
10 |
+
reportOptionalContextManager = false
|
11 |
+
reportOptionalOperand = false
|
pyrightconfig.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"venvPath": ".",
|
3 |
+
"venv": "env",
|
4 |
+
"reportMissingImports": false
|
5 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Main packages for Hugging Face Spaces
|
2 |
+
gradio>=5.0.0
|
3 |
+
transformers>=4.30.0
|
4 |
+
huggingface-hub>=0.15.0
|
5 |
+
torch>=2.0.0
|
6 |
+
torchvision>=0.15.0
|
7 |
+
torchaudio>=2.0.0
|
8 |
+
|
9 |
+
# Acceleration and optimization
|
10 |
+
accelerate>=1.0.0
|
11 |
+
safetensors>=0.3.0
|
12 |
+
|
13 |
+
# Audio processing
|
14 |
+
yt-dlp>=2023.0.0
|
15 |
+
soundfile>=0.12.0
|
16 |
+
pydub>=0.25.0
|
17 |
+
|
18 |
+
# Data processing
|
19 |
+
numpy>=2.0.0
|
20 |
+
scipy>=1.0.0 # Needed by Phi-4
|
21 |
+
peft>=0.5.0 # Needed by Phi-4
|
22 |
+
backoff>=2.0.0 # Needed by Phi-4
|
requirements_local.txt
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Main packages
|
2 |
+
gradio>=5.0.0
|
3 |
+
transformers>=4.30.0
|
4 |
+
huggingface-hub>=0.15.0
|
5 |
+
|
6 |
+
# PyTorch with CUDA support - these will be installed from the specified URL
|
7 |
+
--extra-index-url https://download.pytorch.org/whl/cu126
|
8 |
+
torch==2.7.0+cu126
|
9 |
+
torchvision==0.22.0+cu126
|
10 |
+
torchaudio==2.7.0+cu126
|
11 |
+
|
12 |
+
# Acceleration and optimization
|
13 |
+
accelerate>=1.0.0
|
14 |
+
safetensors>=0.3.0
|
15 |
+
|
16 |
+
# Audio processing
|
17 |
+
yt-dlp>=2023.0.0
|
18 |
+
soundfile>=0.12.0
|
19 |
+
pydub>=0.25.0
|
20 |
+
|
21 |
+
# Data processing
|
22 |
+
numpy>=2.0.0
|
23 |
+
scipy>=1.0.0 # Needed by Phi-4
|
24 |
+
peft>=0.5.0 # Needed by Phi-4
|
25 |
+
backoff>=2.0.0 # Needed by Phi-4
|
26 |
+
|
27 |
+
# Important dependencies that might need specific versions
|
28 |
+
typing_extensions>=4.10.0
|
29 |
+
filelock>=3.0.0
|
30 |
+
fsspec>=2024.0.0
|