|
import gradio as gr |
|
|
|
import torch |
|
from transformers import pipeline |
|
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoConfig, AutoModelForCausalLM |
|
import yt_dlp |
|
import tempfile |
|
import os |
|
import shutil |
|
import numpy as np |
|
import time |
|
import soundfile as sf |
|
import traceback |
|
import platform |
|
import re |
|
import subprocess |
|
|
|
|
|
|
|
|
|
def get_hardware_info(): |
|
"""獲取 CPU 和 GPU 信息""" |
|
|
|
cpu_info = "Unknown CPU" |
|
try: |
|
if platform.system() == "Windows": |
|
output = subprocess.check_output("wmic cpu get name", shell=True).decode().strip().split('\n') |
|
if len(output) >= 2: |
|
cpu_info = output[1].strip() |
|
elif platform.system() == "Linux": |
|
with open('/proc/cpuinfo', 'r') as f: |
|
for line in f: |
|
if line.startswith('model name'): |
|
cpu_info = line.split(':')[1].strip() |
|
break |
|
elif platform.system() == "Darwin": |
|
output = subprocess.check_output("sysctl -n machdep.cpu.brand_string", shell=True).decode().strip() |
|
cpu_info = output |
|
except Exception as e: |
|
print(f"Error getting CPU info: {e}") |
|
|
|
|
|
gpu_info = None |
|
if torch.cuda.is_available(): |
|
try: |
|
gpu_info = torch.cuda.get_device_name(0) |
|
|
|
except Exception as e: |
|
print(f"Error getting GPU info: {e}") |
|
|
|
|
|
|
|
|
|
return cpu_info, gpu_info |
|
|
|
|
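# Lazily-loaded global model state: Whisper-style models live in `pipe`,
# Phi-4 in `phi4_model`/`phi4_processor`. Only one model is resident at a time.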
|
pipe = None |
|
phi4_model = None |
|
phi4_processor = None |
|
current_model_name = None |
|
current_device = "cpu" |
|
|
|
|
|
PHI4_MODEL_ID = "microsoft/Phi-4-multimodal-instruct" |
|
MERALION_MODEL_ID = "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION" |
|
SEALLM_MODEL_ID = "SeaLLMs/SeaLLMs-Audio-7B" |
|
|
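# Model registry shown in the dropdown: approximate parameter counts and
# checkpoint sizes, plus a "type" flag that selects the loading path.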
|
MODEL_DATA = [ |
|
{"id": "openai/whisper-tiny", "params": "~39M", "size": "151 MB", "status_en": "Available", "status_zh": "可用", "type": "whisper"}, |
|
{"id": "openai/whisper-base", "params": "~74M", "size": "290 MB", "status_en": "Available", "status_zh": "可用", "type": "whisper"}, |
|
{"id": "openai/whisper-small", "params": "~244M", "size": "967 MB", "status_en": "Available", "status_zh": "可用", "type": "whisper"}, |
|
{"id": "openai/whisper-medium", "params": "~769M", "size": "3.06 GB", "status_en": "Available (CPU Slow)", "status_zh": "可用 (CPU 慢)", "type": "whisper"}, |
|
{"id": "openai/whisper-large", "params": "~1.55B", "size": "6.17 GB", "status_en": "Available (CPU Very Slow)", "status_zh": "可用 (CPU 極慢)", "type": "whisper"}, |
|
{"id": "openai/whisper-large-v2", "params": "~1.55B", "size": "6.17 GB", "status_en": "Available (CPU Very Slow)", "status_zh": "可用 (CPU 極慢)", "type": "whisper"}, |
|
{"id": "openai/whisper-large-v3", "params": "~1.55B", "size": "3.09 GB", "status_en": "Available (CPU Very Slow)", "status_zh": "可用 (CPU 極慢)", "type": "whisper"}, |
|
{"id": "openai/whisper-large-v3-turbo", "params": "~809M", "size": "1.62 GB", "status_en": "Available (Optimized, CPU Slow)", "status_zh": "可用 (優化, CPU 慢)", "type": "whisper"}, |
|
{"id": PHI4_MODEL_ID, "params": "~5.57B", "size": "11.15 GB", "status_en": "Multimodal (Need Trust, High RAM)", "status_zh": "多模態 (需信任,高RAM)", "type": "phi4"}, |
|
|
|
|
|
] |
|
MODEL_INFO_DICT = {m['id']: m for m in MODEL_DATA} |
|
MODEL_CHOICES_WITH_PARAMS = [ |
|
(f"{m['id'].split('/')[-1]} ({m['params']}, {m['size']}) - {m['status_en']} / {m['status_zh']}", m['id']) |
|
for m in MODEL_DATA |
|
] |
|
DEFAULT_MODEL = "openai/whisper-tiny" |
|
|
|
|
|
BILINGUAL_LANGUAGES_DICT = { |
|
"auto": "Auto-detect / 自動偵測", "en": "English / 英文", "zh": "Chinese / 中文", "de": "German / 德文", "es": "Spanish / 西班牙文", |
|
"ru": "Russian / 俄文", "ko": "Korean / 韓文", "fr": "French / 法文", "ja": "Japanese / 日文", "pt": "Portuguese / 葡萄牙文", "tr": "Turkish / 土耳其文", |
|
"pl": "Polish / 波蘭文", "ca": "Catalan / 加泰隆尼亞文", "nl": "Dutch / 荷蘭文", "ar": "Arabic / 阿拉伯文", "sv": "Swedish / 瑞典文", "it": "Italian / 義大利文", |
|
"id": "Indonesian / 印尼文", "hi": "Hindi / 印地文", "fi": "Finnish / 芬蘭文", "vi": "Vietnamese / 越南文", "he": "Hebrew / 希伯來文", "uk": "Ukrainian / 烏克蘭文", |
|
"el": "Greek / 希臘文", "ms": "Malay / 馬來文", "cs": "Czech / 捷克文", "ro": "Romanian / 羅馬尼亞文", "da": "Danish / 丹麥文", "hu": "Hungarian / 匈牙利文", |
|
"ta": "Tamil / 坦米爾文", "no": "Norwegian / 挪威文", "th": "Thai / 泰文", "ur": "Urdu / 烏爾都文", "hr": "Croatian / 克羅埃西亞文", "bg": "Bulgarian / 保加利亞文", |
|
"lt": "Lithuanian / 立陶宛文", "la": "Latin / 拉丁文", "mi": "Maori / 毛利文", "ml": "Malayalam / 馬拉雅拉姆文", "cy": "Welsh / 威爾斯文", "sk": "Slovak / 斯洛伐克文", |
|
"te": "Telugu / 泰盧固文", "fa": "Persian / 波斯文", "lv": "Latvian / 拉脫維亞文", "bn": "Bengali / 孟加拉文", "sr": "Serbian / 塞爾維亞文", "az": "Azerbaijani / 亞塞拜然文", |
|
"sl": "Slovenian / 斯洛維尼亞文", "kn": "Kannada / 坎那達文", "et": "Estonian / 愛沙尼亞文", "mk": "Macedonian / 馬其頓文", "br": "Breton / 布列塔尼文", |
|
"eu": "Basque / 巴斯克文", "is": "Icelandic / 冰島文", "hy": "Armenian / 亞美尼亞文", "ne": "Nepali / 尼泊爾文", "mn": "Mongolian / 蒙古文", "bs": "Bosnian / 波士尼亞文", |
|
"kk": "Kazakh / 哈薩克文", "sq": "Albanian / 阿爾巴尼亞文", "sw": "Swahili / 史瓦希里文", "gl": "Galician / 加利西亞文", "mr": "Marathi / 馬拉地文", "pa": "Punjabi / 旁遮普文", |
|
"si": "Sinhala / 僧伽羅文", "km": "Khmer / 高棉文", "sn": "Shona / 修納文", "yo": "Yoruba / 約魯巴文", "so": "Somali / 索馬利文", "af": "Afrikaans / 南非荷蘭文", |
|
"oc": "Occitan / 奧克西坦文", "ka": "Georgian / 喬治亞文", "be": "Belarusian / 白俄羅斯文", "tg": "Tajik / 塔吉克文", "sd": "Sindhi / 信德文", "gu": "Gujarati / 古吉拉特文", |
|
"am": "Amharic / 安哈拉文", "yi": "Yiddish / 意第緒文", "lo": "Lao / 寮文", "uz": "Uzbek / 烏茲別克文", "fo": "Faroese / 法羅文", "ht": "Haitian Creole / 海地克里奧爾文", |
|
"ps": "Pashto / 普什圖文", "tk": "Turkmen / 土庫曼文", "nn": "Nynorsk / 新挪威文", "mt": "Maltese / 馬爾他文", "sa": "Sanskrit / 梵文", "lb": "Luxembourgish / 盧森堡文", |
|
"my": "Myanmar / 緬甸文", "bo": "Tibetan / 藏文", "tl": "Tagalog / 他加祿文", "mg": "Malagasy / 馬達加斯加文", "as": "Assamese / 阿薩姆文", "tt": "Tatar / 韃靼文", |
|
"haw": "Hawaiian / 夏威夷文", "ln": "Lingala / 林加拉文", "ha": "Hausa / 豪沙文", "ba": "Bashkir / 巴什基爾文", "jw": "Javanese / 爪哇文", "su": "Sundanese / 巽他文", |
|
"yue": "Cantonese / 粵語", |
|
} |
|
WHISPER_LANGUAGES_LIST = [] |
|
WHISPER_LANGUAGES_LIST.append((BILINGUAL_LANGUAGES_DICT["auto"], "auto")) |
|
def get_english_name(display_name_tuple): return display_name_tuple[0].split('/')[0].strip() |
|
sorted_languages = sorted( [(display_name, code) for code, display_name in BILINGUAL_LANGUAGES_DICT.items() if code != "auto"], key=get_english_name ) |
|
WHISPER_LANGUAGES_LIST.extend(sorted_languages) |
|
PHI4_AUDIO_LANG_CODES = ["auto", "en", "zh", "de", "fr", "it", "ja", "es", "pt"] |
|
PHI4_LANGUAGES_LIST = [(BILINGUAL_LANGUAGES_DICT.get(code, code), code) for code in PHI4_AUDIO_LANG_CODES] |
|
|
|
|
|
MIC_PROMPT = """**Try Reading / 試著朗讀:** |
|
"Success is stumbling from failure to failure with no loss of enthusiasm." - Winston Churchill |
|
「成功是在一次又一次失敗中,依然熱情不減地前行。」 - 溫斯頓・邱吉爾""" |
|
|
|
|
|
|
|
|
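# Format seconds as H:MM:SS.mmm, e.g. format_timestamp(3661.5) -> "1:01:01.500".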
|
def format_timestamp(seconds): |
|
if seconds is None: return "N/A" |
|
milliseconds = round(seconds * 1000) |
|
seconds_int = int(milliseconds // 1000) |
|
milliseconds_rem = milliseconds % 1000 |
|
minutes = seconds_int // 60 |
|
seconds_rem = seconds_int % 60 |
|
hours = minutes // 60 |
|
minutes_rem = minutes % 60 |
|
return f"{hours:01d}:{minutes_rem:02d}:{seconds_rem:02d}.{milliseconds_rem:03d}" |
|
|
|
|
|
def update_download_file(filepath): |
|
"""當有音訊檔案時更新下載檔案""" |
|
if filepath and os.path.exists(filepath): |
|
return filepath |
|
return None |
|
|
|
|
|
def process_youtube_url(youtube_url): |
|
"""處理 YouTube URL,下載音訊並返回播放器和下載按鈕的更新""" |
|
if not youtube_url or not youtube_url.strip(): |
|
return gr.update(visible=False, value=None), gr.update(visible=False, value=None) |
|
|
|
|
|
    is_spaces = os.environ.get("SPACE_ID") is not None
|
|
|
try: |
|
|
|
print(f"Processing YouTube URL: {youtube_url}") |
|
|
|
if is_spaces: |
|
|
|
print("Warning: YouTube download is not supported in Hugging Face Spaces.") |
|
raise gr.Error("YouTube 下載在 Hugging Face Spaces 中不可用。由於安全限制,Spaces 環境無法通過 YouTube 的機器人驗證。請在本地環境中使用此功能。\n\nYouTube download is not available in Hugging Face Spaces. Due to security restrictions, Spaces environment cannot pass YouTube's bot verification. Please use this feature in a local environment.") |
|
|
|
|
|
audio_path, temp_dir, duration = download_youtube_audio(youtube_url) |
|
|
|
if audio_path and os.path.exists(audio_path): |
|
|
|
return gr.update(visible=True, value=audio_path), gr.update(visible=True, value=audio_path) |
|
else: |
|
return gr.update(visible=False, value=None), gr.update(visible=False, value=None) |
|
    except gr.Error:
        # Propagate Gradio errors so they surface in the UI instead of being
        # swallowed by the generic handler below.
        raise
    except Exception as e:
        print(f"Error processing YouTube URL: {e}")
        return gr.update(visible=False, value=None), gr.update(visible=False, value=None)
|
|
|
|
|
def download_youtube_audio(url): |
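    """Download a YouTube video's audio track as MP3.

    Returns (filepath, temp_dir, duration_seconds) on success,
    or (None, None, None) on failure.
    """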
|
|
|
download_dir = os.path.join(tempfile.gettempdir(), "youtube_downloads") |
|
os.makedirs(download_dir, exist_ok=True) |
|
|
|
|
|
video_id = url.split("v=")[-1].split("&")[0] if "v=" in url else str(int(time.time())) |
|
filename = f"youtube_{video_id}_{int(time.time())}" |
|
|
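    # The download itself targets download_dir above; this separate temp_dir
    # is returned so the caller can trigger cleanup after transcription.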
|
temp_dir = tempfile.mkdtemp() |
|
downloaded_path = None |
|
try: |
|
temp_filepath_tmpl = os.path.join(download_dir, f"{filename}.%(ext)s") |
|
ydl_opts = { |
|
'format': 'bestaudio/best', |
|
'outtmpl': temp_filepath_tmpl, |
|
'noplaylist': True, |
|
'quiet': True, |
|
'postprocessors': [{'key': 'FFmpegExtractAudio','preferredcodec': 'mp3','preferredquality': '192',}], |
|
'ffmpeg_location': shutil.which("ffmpeg"), |
|
} |
|
if not ydl_opts['ffmpeg_location']: print("Warning: ffmpeg not found... / 警告:找不到 ffmpeg...") |
|
with yt_dlp.YoutubeDL(ydl_opts) as ydl: |
|
info_dict = ydl.extract_info(url, download=True) |
|
duration = info_dict.get('duration') |
|
title = info_dict.get('title', 'unknown') |
|
|
|
final_filepath = ydl.prepare_filename(info_dict) |
|
if not final_filepath.endswith('.mp3'): |
|
base_name = final_filepath.rsplit('.', 1)[0] |
|
final_filepath = base_name + '.mp3' |
|
|
|
if os.path.exists(final_filepath): |
|
downloaded_path = final_filepath |
|
print(f"YouTube audio downloaded: {downloaded_path}") |
|
print(f"Title: {title}, Duration: {duration}s") |
|
else: |
|
potential_files = [os.path.join(download_dir, f) for f in os.listdir(download_dir) if f.startswith(filename) and f.endswith(".mp3")] |
|
if potential_files: |
|
downloaded_path = potential_files[0] |
|
print(f"Warning: Could not find expected MP3, using fallback: {downloaded_path}") |
|
duration = None |
|
else: |
|
raise FileNotFoundError(f"Audio file not found after download in {download_dir}") |
|
|
|
return downloaded_path, temp_dir, duration |
|
except Exception as e: |
|
print(f"Error processing YouTube URL: {e}") |
|
if temp_dir and os.path.exists(temp_dir): |
|
try: shutil.rmtree(temp_dir) |
|
except Exception as cleanup_e: print(f"Error cleaning temp directory {temp_dir}: {cleanup_e}") |
|
return None, None, None |
|
|
|
|
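# Build a transformers ASR pipeline on the selected device. Prefers CUDA when
# requested and available; otherwise falls back to CPU.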
|
def load_asr_pipeline(model_id): |
|
global pipe, phi4_model, phi4_processor, current_device |
|
print(f"DEBUG: Loading ASR pipeline for {model_id} on device: {current_device}") |
|
trust_code = model_id in [MERALION_MODEL_ID, SEALLM_MODEL_ID] |
|
if trust_code: print(f"DEBUG: Setting trust_remote_code=True for pipeline model {model_id}") |
|
try: |
|
phi4_model = None |
|
phi4_processor = None |
|
|
|
|
|
if current_device == "gpu": |
|
|
|
if torch.cuda.is_available(): |
|
try: |
|
|
|
pipe = pipeline( |
|
"automatic-speech-recognition", |
|
model=model_id, |
|
trust_remote_code=trust_code, |
|
device="cuda" |
|
) |
|
|
|
print(f"DEBUG: Using GPU (CUDA) for ASR pipeline. Available GPU: {torch.cuda.get_device_name(0)}") |
|
except Exception as e: |
|
|
|
pipe = pipeline( |
|
"automatic-speech-recognition", |
|
model=model_id, |
|
trust_remote_code=trust_code, |
|
device=0 |
|
) |
|
print(f"DEBUG: Using GPU (device=0) for ASR pipeline. Reason for device_map failure: {str(e)}") |
|
else: |
|
|
|
pipe = pipeline( |
|
"automatic-speech-recognition", |
|
model=model_id, |
|
trust_remote_code=trust_code, |
|
device="cpu" |
|
) |
|
print("WARNING: GPU selected but CUDA is not available. Falling back to CPU.") |
|
else: |
|
|
|
pipe = pipeline( |
|
"automatic-speech-recognition", |
|
model=model_id, |
|
trust_remote_code=trust_code, |
|
device="cpu" |
|
) |
|
|
|
print("DEBUG: Using CPU for ASR pipeline.") |
|
print(f"DEBUG: Model loaded on device: {pipe.device}") |
|
return pipe |
|
except Exception as e: |
|
print(f"Error loading ASR pipeline for {model_id}:") |
|
traceback.print_exc() |
|
raise e |
|
|
|
|
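# Load Phi-4-multimodal: try fp16 on CUDA first, fall back to fp32 on CPU.
# _attn_implementation="eager" sidesteps the flash-attn requirement.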
|
def load_phi4_model(model_id): |
|
global pipe, phi4_model, phi4_processor, current_device |
|
print(f"DEBUG: Loading Phi-4 model {model_id} on device: {current_device}") |
|
try: |
|
pipe = None |
|
phi4_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) |
|
|
|
|
|
if current_device == "gpu": |
|
|
|
if torch.cuda.is_available(): |
|
try: |
|
|
|
phi4_model = AutoModelForCausalLM.from_pretrained( |
|
model_id, |
|
trust_remote_code=True, |
|
torch_dtype=torch.float16, |
|
_attn_implementation="eager", |
|
) |
|
phi4_model = phi4_model.to("cuda") |
|
print(f"DEBUG: Using GPU (CUDA) for Phi-4. Available GPU: {torch.cuda.get_device_name(0)}") |
|
except Exception as e: |
|
|
|
try: |
|
phi4_model = AutoModelForCausalLM.from_pretrained( |
|
model_id, |
|
trust_remote_code=True, |
|
torch_dtype=torch.float16, |
|
_attn_implementation="eager", |
|
) |
|
phi4_model = phi4_model.to("cuda:0") |
|
print(f"DEBUG: Using GPU (device=0) for Phi-4. Reason for first attempt failure: {str(e)}") |
|
except Exception as e2: |
|
|
|
phi4_model = AutoModelForCausalLM.from_pretrained( |
|
model_id, |
|
trust_remote_code=True, |
|
torch_dtype=torch.float32, |
|
_attn_implementation="eager", |
|
) |
|
phi4_model = phi4_model.to("cpu") |
|
print(f"WARNING: Failed to use GPU for Phi-4, falling back to CPU. Error: {str(e2)}") |
|
else: |
|
|
|
phi4_model = AutoModelForCausalLM.from_pretrained( |
|
model_id, |
|
trust_remote_code=True, |
|
torch_dtype=torch.float32, |
|
_attn_implementation="eager", |
|
) |
|
phi4_model = phi4_model.to("cpu") |
|
print("WARNING: GPU selected but CUDA is not available. Falling back to CPU for Phi-4.") |
|
else: |
|
|
|
phi4_model = AutoModelForCausalLM.from_pretrained( |
|
model_id, |
|
trust_remote_code=True, |
|
torch_dtype=torch.float32, |
|
_attn_implementation="eager", |
|
) |
|
phi4_model = phi4_model.to("cpu") |
|
print("DEBUG: Using CPU for Phi-4.") |
|
|
|
print(f"DEBUG: Phi-4 model loaded on device: {next(phi4_model.parameters()).device}") |
|
return phi4_model, phi4_processor |
|
except Exception as e: |
|
print(f"Error loading Phi-4 model {model_id}:") |
|
traceback.print_exc() |
|
if "scipy" in str(e) or "torchvision" in str(e) or "peft" in str(e): |
|
missing_pkg = "scipy" if "scipy" in str(e) else "torchvision" if "torchvision" in str(e) else "peft" |
|
raise type(e)(f"{e}. Please ensure '{missing_pkg}' is in requirements.txt") from e |
|
else: raise e |
|
|
|
|
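# Main handler behind the Submit button. Appends new results to the previous
# output text. Returns either a bare result string (success path) or a 4-tuple
# (text, mic update, file update, youtube update) on early exits; the
# transcribe_audio_with_error_handling wrapper normalizes both shapes.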
|
def transcribe_audio(mic_input, file_input, youtube_url, selected_model_identifier, |
|
task, language, return_timestamps, |
|
phi4_prompt_text, device_choice, |
|
previous_output_text, active_tab): |
|
global pipe, phi4_model, phi4_processor, current_model_name, current_device |
|
audio_source = None |
|
source_type_en = "" |
|
source_type_zh = "" |
|
temp_dir_to_clean = None |
|
audio_duration = None |
|
model_name_for_display = selected_model_identifier |
|
model_load_time = 0.0 |
|
inference_time = 0.0 |
|
model_type = MODEL_INFO_DICT.get(selected_model_identifier, {}).get("type", "other") |
|
output_text_accumulated = previous_output_text if previous_output_text else "" |
|
status_update_prefix = output_text_accumulated + ("\n\n---\n\n" if output_text_accumulated else "") |
|
final_output_text = output_text_accumulated |
|
|
|
|
|
if device_choice != current_device: |
|
current_device = device_choice |
|
print(f"DEBUG: Device changed to {current_device}") |
|
|
|
pipe = None |
|
phi4_model = None |
|
phi4_processor = None |
|
current_model_name = None |
|
|
|
|
|
model_changed = selected_model_identifier != current_model_name |
|
model_needs_load = (model_type == "phi4" and phi4_model is None) or (model_type != "phi4" and pipe is None) |
|
|
|
if model_changed or model_needs_load: |
|
warning_message = "" |
|
|
|
if selected_model_identifier in [PHI4_MODEL_ID, MERALION_MODEL_ID, SEALLM_MODEL_ID]: |
|
warning_message += f"Warning: Model {selected_model_identifier} requires executing remote code.\n警告: 模型 {selected_model_identifier} 需要執行遠端程式碼。\n" |
|
if "seallms" in selected_model_identifier.lower() or "meralion" in selected_model_identifier.lower(): warning_message += f"Warning: Model {selected_model_identifier} likely requires >16GB RAM.\n警告: 模型 {selected_model_identifier} 可能需要 >16GB RAM。\n" |
|
if model_type == "phi4": warning_message += f"Warning: Phi-4 uses a different process.\n警告: Phi-4 使用不同處理流程。\n" |
|
print(f"Attempting to load model / 嘗試載入模型: {selected_model_identifier} (Type / 類型: {model_type})") |
|
status_update_str = warning_message + f"Loading model / 正在載入模型: {selected_model_identifier}..." |
|
|
|
output_text_accumulated = status_update_prefix + status_update_str |
|
|
|
load_start_time = time.monotonic() |
|
try: |
|
if model_type == "phi4": |
|
phi4_model, phi4_processor = load_phi4_model(selected_model_identifier) |
|
pipe = None |
|
else: |
|
pipe = load_asr_pipeline(selected_model_identifier) |
|
phi4_model = None |
|
phi4_processor = None |
|
load_end_time = time.monotonic() |
|
model_load_time = load_end_time - load_start_time |
|
current_model_name = selected_model_identifier |
|
model_name_for_display = current_model_name |
|
print(f"Model {current_model_name} loaded successfully ({model_load_time:.2f}s). / 模型 {current_model_name} 載入成功 ({model_load_time:.2f} 秒).") |
|
status_update_str = warning_message + f"Model {current_model_name} loaded successfully / 載入成功 ({model_load_time:.2f}s)." |
|
|
|
output_text_accumulated = status_update_prefix + status_update_str |
|
except Exception as e: |
|
load_end_time = time.monotonic() |
|
model_load_time = load_end_time - load_start_time |
|
print(f"Failed to load model {selected_model_identifier} ({model_load_time:.2f}s). / 載入模型 {selected_model_identifier} 失敗 ({model_load_time:.2f} 秒).") |
|
error_msg = f"Error: Failed to load model {selected_model_identifier}:\n錯誤: 載入模型 {selected_model_identifier} 失敗:\n{e}\n({model_load_time:.2f}s)" |
|
if "requires `accelerate`" in str(e): error_msg += "\n**Missing 'accelerate'. Please install. / 缺少 'accelerate',請安裝.**" |
|
if isinstance(e, (MemoryError, RuntimeError)) and "out of memory" in str(e).lower(): error_msg += "\n**Out of Memory. Try a smaller model. / 記憶體不足,請嘗試較小模型.**" |
|
if "trust_remote_code=True" in str(e): error_msg += "\n**Requires trusting remote code. Model might be unsafe. / 需要信任遠端代碼,模型可能不安全.**" |
|
if "scipy" in str(e) or "torchvision" in str(e) or "peft" in str(e): |
|
missing_pkg = "scipy" if "scipy" in str(e) else "torchvision" if "torchvision" in str(e) else "peft" |
|
error_msg += f"\n**Missing '{missing_pkg}'. Please install. / 缺少 '{missing_pkg}',請安裝.**" |
|
status_update_str = warning_message + error_msg |
|
pipe = None |
|
phi4_model = None |
|
phi4_processor = None |
|
current_model_name = None |
|
|
|
output_text_accumulated = status_update_prefix + status_update_str |
|
return (output_text_accumulated, gr.update(), gr.update(), gr.update()) |
|
|
|
|
|
if (model_type == "phi4" and phi4_model is None) or (model_type != "phi4" and pipe is None): |
|
output_text_accumulated = status_update_prefix + "Error: Cannot use model. / 錯誤: 無法使用模型." |
|
return (output_text_accumulated, gr.update(), gr.update(), gr.update()) |
|
|
|
|
|
|
|
print(f"DEBUG: Active tab is {active_tab}") |
|
|
|
if active_tab == "mic" and mic_input is not None: |
|
audio_source = mic_input |
|
source_type_en = "Microphone" |
|
source_type_zh = "麥克風" |
|
elif active_tab == "file" and file_input is not None: |
|
|
|
if isinstance(file_input, list) and len(file_input) > 0: |
|
|
|
audio_source = file_input[0] |
|
else: |
|
|
|
audio_source = file_input |
|
source_type_en = "File Upload" |
|
source_type_zh = "檔案上傳" |
|
elif active_tab == "youtube" and youtube_url and youtube_url.strip(): |
|
source_type_en = "YouTube" |
|
source_type_zh = "YouTube" |
|
status_update_str = f"Downloading YouTube Audio / 正在下載 YouTube 音訊..." |
|
output_text_accumulated = status_update_prefix + status_update_str |
|
|
|
|
|
is_spaces = os.environ.get("SPACE_ID") is not None |
|
if is_spaces: |
|
output_text_accumulated = status_update_prefix + "Error: YouTube download is not supported in Hugging Face Spaces. / 錯誤:YouTube 下載在 Hugging Face Spaces 中不可用。" |
|
return (output_text_accumulated, gr.update(), gr.update(), gr.update()) |
|
|
|
|
|
audio_path, temp_dir_to_clean, duration_yt = download_youtube_audio(youtube_url) |
|
|
|
if audio_path and os.path.exists(audio_path): |
|
audio_source = audio_path |
|
audio_duration = duration_yt |
|
else: |
|
output_text_accumulated = status_update_prefix + "Error: Failed to download YouTube audio. / 錯誤:無法下載 YouTube 音訊。" |
|
return (output_text_accumulated, gr.update(), gr.update(), gr.update()) |
|
else: |
|
|
|
return (previous_output_text, gr.update(), gr.update(), gr.update()) |
|
|
|
if audio_source is None: |
|
output_text_accumulated = status_update_prefix + f"Error: No audio file provided. / 錯誤:未提供音訊檔案." |
|
return (output_text_accumulated, gr.update(), gr.update(), gr.update()) |
|
|
|
|
|
if not os.path.exists(audio_source): |
|
output_text_accumulated = status_update_prefix + f"Error: Audio file not found '{audio_source}'. / 錯誤:找不到音訊檔案 '{audio_source}'." |
|
return (output_text_accumulated, gr.update(), gr.update(), gr.update()) |
|
|
|
|
|
valid_audio_extensions = ['.wav', '.mp3', '.ogg', '.flac', '.m4a', '.aac'] |
|
file_ext = os.path.splitext(audio_source)[1].lower() |
|
if file_ext not in valid_audio_extensions: |
|
output_text_accumulated = status_update_prefix + f"Error: Invalid audio file format '{file_ext}'. / 錯誤:無效的音訊檔案格式 '{file_ext}'." |
|
return (output_text_accumulated, gr.update(), gr.update(), gr.update()) |
|
|
|
if audio_duration is None: |
|
try: |
|
|
|
if file_ext == '.wav': |
|
|
|
import wave |
|
try: |
|
with wave.open(audio_source, 'rb') as wf: |
|
frames = wf.getnframes() |
|
rate = wf.getframerate() |
|
audio_duration = frames / float(rate) |
|
print(f"Got audio duration from wave module / 從 wave 模塊獲取音檔時長: {audio_duration:.2f}s") |
|
except Exception as wave_err: |
|
print(f"Could not get audio duration from wave module / 無法從 wave 模塊獲取音檔時長: {wave_err}") |
|
|
|
info = sf.info(audio_source) |
|
audio_duration = info.duration |
|
print(f"Got audio duration from soundfile / 從 soundfile 獲取音檔時長: {audio_duration:.2f}s") |
|
else: |
|
|
|
info = sf.info(audio_source) |
|
audio_duration = info.duration |
|
print(f"Got audio duration from soundfile / 從 soundfile 獲取音檔時長: {audio_duration:.2f}s") |
|
except Exception as e: |
|
print(f"Could not get audio duration / 無法獲取音檔時長: {e}") |
|
|
|
audio_duration = 0.0 |
|
print(f"Using default audio duration / 使用默認音檔時長: {audio_duration:.2f}s") |
|
|
|
print(f"Processing with {current_model_name} from [{source_type_en} / {source_type_zh}]: {audio_source}") |
|
print(f"Options: Task='{task}', Language(Source)='{language}', Timestamps='{return_timestamps}'") |
|
if model_type == "phi4": print(f"Phi-4 Prompt: '{phi4_prompt_text}'") |
|
|
|
status_update_str = f"Processing, please wait... / 正在處理,請稍候...\n(Model / 模型: {model_name_for_display})" |
|
output_text_accumulated = status_update_prefix + status_update_str |
|
|
|
|
|
inference_start_time = time.monotonic() |
|
current_run_output = "" |
|
timing_info_str = "" |
|
try: |
|
if model_type == "phi4": |
|
print("DEBUG: Processing with Phi-4...") |
|
if not phi4_model or not phi4_processor: raise ValueError("Phi-4 model/processor not loaded / Phi-4 模型/處理器未載入") |
|
if not phi4_prompt_text: raise ValueError("Phi-4 requires a prompt text / Phi-4 需要提示文字") |
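            # Phi-4-multimodal chat template: the <|audio_1|> placeholder is
            # bound to the audio clip passed to the processor via `audios=`.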
|
user_prompt_tag='<|user|>' |
|
assistant_prompt_tag='<|assistant|>' |
|
end_tag='<|end|>' |
|
prompt = f"{user_prompt_tag}<|audio_1|>{phi4_prompt_text}{end_tag}{assistant_prompt_tag}" |
|
audio_data, samplerate = sf.read(audio_source) |
|
inputs = phi4_processor(text=prompt, audios=[(audio_data, samplerate)], return_tensors='pt').to(phi4_model.device) |
|
with torch.no_grad(): generate_ids = phi4_model.generate(**inputs, max_new_tokens=500, num_logits_to_keep=0) |
|
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] |
|
result_text = phi4_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] |
|
current_run_output = result_text.strip() |
|
return_timestamps = False |
|
else: |
|
print("DEBUG: Processing with ASR pipeline...") |
|
if not pipe: raise ValueError("ASR pipeline not loaded / ASR pipeline 未載入") |
|
generate_kwargs_pipe = {"task": task} |
|
|
|
|
|
if task == "transcribe": |
|
|
|
if language != "auto": |
|
generate_kwargs_pipe["language"] = language |
|
print(f"DEBUG: Setting source language to {language} for transcription") |
|
else: |
|
|
|
|
|
print(f"DEBUG: Translation target language is {language}, but Whisper only supports English as target") |
|
|
|
|
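            # Whisper operates on 30-second windows; chunk_length_s=30 lets the
            # pipeline stride over arbitrarily long audio in chunks.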
|
pipeline_kwargs = { |
|
"chunk_length_s": 30, |
|
"batch_size": 1, |
|
"return_timestamps": "chunks" if return_timestamps else False, |
|
"generate_kwargs": generate_kwargs_pipe |
|
} |
|
|
|
|
|
|
|
|
|
result = pipe(audio_source, **pipeline_kwargs) |
|
|
|
print("DEBUG: pipe() call finished.") |
|
print("DEBUG: Raw result type:", type(result)) |
|
print("DEBUG: Raw result content:", result) |
|
|
|
|
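            # Pipeline output shape varies by model: a dict with "chunks"
            # (timestamped), a dict with "text", a bare string, or a list of
            # {"generated_text": ...} dicts from remote-code models.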
|
if return_timestamps and isinstance(result, dict) and "chunks" in result: |
|
formatted_chunks = [f"[{format_timestamp(chunk.get('timestamp', (None,))[0])} -> {format_timestamp(chunk.get('timestamp', (None, None))[1])}] {chunk.get('text', '').strip()}" for chunk in result["chunks"]] |
|
current_run_output = "\n".join(formatted_chunks).strip() |
|
elif isinstance(result, dict) and "text" in result: |
|
current_run_output = result["text"].strip() |
|
elif isinstance(result, str): |
|
current_run_output = result.strip() |
|
elif isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict) and 'generated_text' in result[0]: |
|
current_run_output = result[0]['generated_text'].strip() |
|
else: |
|
current_run_output = f"(Unrecognized result format / 無法識別的結果格式: {type(result)})" |
|
|
|
print("DEBUG: Processed result:", current_run_output[:100] + "..." if len(current_run_output) > 100 else current_run_output) |
|
|
|
inference_end_time = time.monotonic() |
|
inference_time = inference_end_time - inference_start_time |
|
if not current_run_output: current_run_output = "(Audio empty or unrecognizable / 音檔空白或無法辨識)" |
|
|
|
|
|
timing_info_str = f"Model / 模型: {model_name_for_display}\n" |
|
if model_load_time > 0: timing_info_str += f"Model Load Time / 模型載入時間: {model_load_time:.2f} seconds / 秒\n" |
|
timing_info_str += f"Inference Time / 推論時間: {inference_time:.2f} seconds / 秒\n" |
|
relative_speed_str = "(relative time unavailable / 無法計算相對時間)" |
|
if audio_duration is not None and audio_duration > 0: |
|
relative_speed = inference_time / audio_duration |
|
|
|
relative_speed_str = f"audio duration / 音檔長度 x {relative_speed:.2f}" |
|
timing_info_str += f"audio duration / 音檔時長: {audio_duration:.2f} seconds / 秒\n" |
|
timing_info_str += f"relative speed / 相對速度: {relative_speed_str}" |
|
|
|
print(f"Processing finished. / 處理完成。") |
|
print(timing_info_str.replace('\n', ' | ')) |
|
print(f"Result Text / 結果文字:\n{current_run_output}") |
|
|
|
|
|
if not current_run_output or current_run_output.strip() == "": |
|
current_run_output = "No text detected in audio / 音頻中未檢測到文字" |
|
|
|
|
|
final_output_text = "" |
|
if status_update_prefix and status_update_prefix.strip(): |
|
final_output_text += status_update_prefix + "\n" |
|
|
|
|
|
final_output_text += timing_info_str + "\n\n" |
|
|
|
|
|
final_output_text += "Result Text / 結果文字:\n" + current_run_output |
|
|
|
|
|
        final_output_text = final_output_text.strip()
        # Guard against degenerate output (e.g. a bare "."): rebuild it from
        # the timing info and the transcription result.
        if final_output_text == "." or not final_output_text:
            print("DEBUG: Detected empty/dot-only output, rebuilding...")
            final_output_text = timing_info_str + "\n\nResult Text / 結果文字:\n" + current_run_output

        return final_output_text
|
|
|
except Exception as e: |
|
inference_end_time = time.monotonic() |
|
inference_time = inference_end_time - inference_start_time |
|
print(f"DEBUG: Exception occurred during processing / 處理過程中發生錯誤:") |
|
traceback.print_exc() |
|
error_message = f"Processing Failed / 處理失敗:\n{e}" |
|
final_output_text = (status_update_prefix + error_message).strip() |
|
timing_info_str = f"Model / 模型: {model_name_for_display}\n" |
|
if model_load_time > 0: timing_info_str += f"Model Load Time / 模型載入時間: {model_load_time:.2f} seconds / 秒\n" |
|
timing_info_str += f"Inference Time (until error) / 推論時間 (至錯誤): {inference_time:.2f} seconds / 秒\n" |
|
timing_info_str += "Processing Failed / 處理失敗" |
|
final_output_text += "\n\n" + timing_info_str |
|
if isinstance(e, (MemoryError, RuntimeError)) and "out of memory" in str(e).lower(): final_output_text += "\n\nOut of Memory, try smaller model. / 記憶體不足,請用小模型." |
|
|
|
finally: |
|
if temp_dir_to_clean: |
|
print(f"Cleaning YouTube temp files / 清理 YouTube 暫存: {temp_dir_to_clean}") |
|
|
|
try: |
|
shutil.rmtree(temp_dir_to_clean) |
|
except Exception as e: |
|
print(f"Failed to clean temp files / 清理暫存失敗: {e}") |
|
|
|
print("DEBUG: Returning final result tuple...") |
|
|
|
|
|
|
|
    # final_output_text is always a string here (the success path returns
    # earlier, inside the try block), so return the full 4-tuple.
    return (final_output_text, gr.update(), gr.update(), gr.update())
|
|
|
|
|
|
|
|
|
def update_file_audio_player(file_path): |
|
if file_path is None: |
|
return gr.update(value=None, visible=False) |
|
|
|
|
|
if isinstance(file_path, list) and len(file_path) > 0: |
|
file_path = file_path[0] |
|
|
|
|
|
if not os.path.exists(file_path): |
|
return gr.update(value=None, visible=False) |
|
|
|
|
|
valid_audio_extensions = ['.wav', '.mp3', '.ogg', '.flac', '.m4a', '.aac'] |
|
file_ext = os.path.splitext(file_path)[1].lower() |
|
if file_ext not in valid_audio_extensions: |
|
return gr.update(value=None, visible=False) |
|
|
|
|
|
return gr.update(value=file_path, visible=True) |
|
|
|
def update_task_choices(selected_model_id): |
|
model_type = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other") |
|
if model_type == "whisper": new_choices = [ ("Transcribe / 轉錄", "transcribe"), ("Translate (Whisper only to English) / 翻譯 (Whisper 僅支援轉譯至英文)", "translate") ] |
|
else: new_choices = [ ("Transcribe / 轉錄", "transcribe"), ("Translate / 轉譯", "translate") ] |
|
return gr.update(choices=new_choices) |
|
|
|
def update_phi4_prompt_ui(selected_model_id, task, language_code): |
|
model_type = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other") |
|
is_phi4 = model_type == "phi4" |
|
prompt_text = "" |
|
if is_phi4: |
|
if task == "transcribe": |
|
if language_code == "auto": |
|
prompt_text = "Transcribe the audio to text." |
|
else: |
|
lang_display_name = BILINGUAL_LANGUAGES_DICT.get(language_code, language_code) |
|
lang_english_name = lang_display_name.split('/')[0].strip() |
|
prompt_text = f"Transcribe the audio in {lang_english_name}." |
|
elif task == "translate": |
|
|
|
lang_display_name = BILINGUAL_LANGUAGES_DICT.get(language_code, language_code) |
|
lang_english_name = lang_display_name.split('/')[0].strip() |
|
if language_code == "auto" or language_code == "en": |
|
|
|
prompt_text = "Translate the audio to text." |
|
else: |
|
|
|
prompt_text = f"Detect the language in the audio and translate it to {lang_english_name}." |
|
|
|
return gr.update(visible=is_phi4, value=prompt_text) |
|
|
|
def update_language_choices(selected_model_id): |
|
model_type = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other") |
|
if model_type == "phi4": return gr.update(choices=PHI4_LANGUAGES_LIST, value="auto") |
|
else: return gr.update(choices=WHISPER_LANGUAGES_LIST, value="auto") |
|
|
|
def update_timestamp_visibility(selected_model_id): |
|
model_type = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other") |
|
print(f"DEBUG: Updating timestamp visibility for {selected_model_id}. Type: {model_type}. Visible: {model_type != 'phi4'}") |
|
return gr.update(visible=(model_type != "phi4")) |
|
|
|
def update_language_ui(model_id, task): |
|
"""根據模型和任務更新語言選擇器的標籤和可見性""" |
|
model_type = MODEL_INFO_DICT.get(model_id, {}).get("type", "other") |
|
|
|
|
|
if model_type == "whisper" and task == "translate": |
|
return gr.update(visible=False, label="Target Language / 目標語言") |
|
|
|
|
|
if task == "transcribe": |
|
return gr.update(visible=True, label="Source Language / 來源語言") |
|
else: |
|
return gr.update(visible=True, label="Target Language / 目標語言") |
|
|
|
|
|
|
|
compact_css = """ |
|
.tabitem { margin: 0rem !important; padding: 0rem !important;} |
|
.compact-file > div { min-height: unset !important; } |
|
.warning-box { |
|
background-color: #fff3cd; |
|
color: #856404; |
|
padding: 10px; |
|
border-radius: 5px; |
|
border-left: 5px solid #ffc107; |
|
margin-bottom: 15px; |
|
} |
|
.info-box { |
|
background-color: #d1ecf1; |
|
color: #0c5460; |
|
padding: 10px; |
|
border-radius: 5px; |
|
border-left: 5px solid #17a2b8; |
|
margin-bottom: 15px; |
|
} |
|
""" |
|
|
|
|
|
|
|
with gr.Blocks(css=compact_css, theme=gr.themes.Default(spacing_size=gr.themes.sizes.spacing_sm, text_size=gr.themes.sizes.text_sm)) as demo: |
|
|
|
gr.Markdown("# Automatic Speech Recognition(ASR) & Speech to Text(STT) / 語音辨識、語音轉文字 🔊🔄📝\nUse AI models to transcribe or translate speech from microphone, file uploads, or YouTube. / 使用 AI 模型轉錄或翻譯來自麥克風、上傳檔案或 YouTube 的語音。") |
|
|
|
with gr.Row(): |
|
|
|
with gr.Column(scale=4): |
|
|
|
active_tab = gr.State(value="mic") |
|
|
|
|
|
def set_active_tab(tab_name): |
|
return tab_name |
|
|
|
with gr.Tabs() as tabs: |
|
with gr.TabItem("🎤 Microphone / 麥克風") as mic_tab: |
|
gr.Markdown(MIC_PROMPT, elem_classes="compact-markdown") |
|
mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio / 錄製音訊") |
|
download_output = gr.File(label="Download Recording / 下載錄音檔", interactive=False, elem_classes="compact-file") |
|
|
|
with gr.TabItem("📁 Upload File / 上傳檔案") as file_tab: |
|
|
|
file_input = gr.File(label="Upload Audio File / 上傳音訊檔", file_types=["audio"], type="filepath") |
|
|
|
|
|
file_audio_player = gr.Audio(label="Audio Preview / 音訊預覽", interactive=False, visible=False) |
|
|
|
with gr.TabItem("▶️ YouTube") as youtube_tab: |
|
|
|
is_spaces = os.environ.get("SPACE_ID") is not None |
|
|
|
|
|
if is_spaces: |
|
gr.Markdown(""" |
|
⚠️ **注意:YouTube 下載在 Hugging Face Spaces 中不可用** |
|
|
|
由於安全限制,Spaces 環境無法通過 YouTube 的機器人驗證。請在本地環境中使用此功能。 |
|
|
|
⚠️ **Note: YouTube download is not available in Hugging Face Spaces** |
|
|
|
Due to security restrictions, Spaces environment cannot pass YouTube's bot verification. Please use this feature in a local environment. |
|
""", elem_classes="warning-box") |
|
|
|
|
|
youtube_input = gr.Textbox(label="YouTube URL / 網址", placeholder="Paste YouTube link here / 在此貼上 YouTube 連結") |
|
|
|
|
|
with gr.Row(): |
|
youtube_audio_player = gr.Audio(label="YouTube Audio / YouTube 音訊", interactive=False, visible=False) |
|
youtube_download = gr.File(label="Download YouTube Audio / 下載 YouTube 音訊", interactive=False, visible=False, elem_classes="compact-file") |
|
|
|
|
|
def process_example_url(url): |
|
"""處理範例 URL 的函數""" |
|
|
|
is_spaces = os.environ.get("SPACE_ID") is not None |
|
|
|
|
|
|
|
|
|
|
if is_spaces: |
|
print("Skipping example URL processing in Spaces environment") |
|
return gr.update(visible=False, value=None), gr.update(visible=False, value=None) |
|
|
|
|
|
return process_youtube_url(url) |
|
|
|
gr.Examples( |
|
examples=[["https://www.youtube.com/watch?v=5D7l0tqQJ7k"]], |
|
inputs=[youtube_input], |
|
label="Example YouTube URL / 範例 YouTube 網址", |
|
fn=process_example_url, |
|
outputs=[youtube_audio_player, youtube_download] |
|
) |
|
|
|
|
|
mic_tab.select(fn=lambda: set_active_tab("mic"), inputs=[], outputs=[active_tab]) |
|
file_tab.select(fn=lambda: set_active_tab("file"), inputs=[], outputs=[active_tab]) |
|
youtube_tab.select(fn=lambda: set_active_tab("youtube"), inputs=[], outputs=[active_tab]) |
|
|
|
|
|
with gr.Column(elem_id="options-block"): |
|
model_select = gr.Dropdown(choices=MODEL_CHOICES_WITH_PARAMS, label="Model / 模型", value=DEFAULT_MODEL, elem_classes="compact-label") |
|
|
|
cpu_info, gpu_info = get_hardware_info() |
|
device_choices = [(f"CPU ({cpu_info})", "cpu")] |
|
if torch.cuda.is_available() and gpu_info: |
|
device_choices.append((f"GPU ({gpu_info})", "gpu")) |
|
device_input = gr.Radio(choices=device_choices, label="Device / 設備", value="cpu", elem_classes="compact-label radio-align") |
|
task_input = gr.Radio(choices=[("Transcribe / 轉錄", "transcribe"), ("Translate / 轉譯", "translate")], label="Task / 任務", value="transcribe", elem_classes="compact-label radio-align") |
|
language_input = gr.Dropdown(choices=WHISPER_LANGUAGES_LIST, label="Source Language / 來源語言", value="auto", elem_classes="compact-label") |
|
|
|
phi4_prompt_input = gr.Textbox(label="Only for Phi-4 Prompt / 僅用於 Phi-4 指令", placeholder="e.g., Transcribe the audio to text.", lines=1, visible=False, elem_classes="compact-label") |
|
timestamp_input = gr.Checkbox(label="Show Timestamps / 顯示時間戳", value=False, elem_classes="compact-label checkbox-align") |
|
|
|
|
|
with gr.Column(scale=6): |
|
submit_button = gr.Button("Submit / 提交", variant="primary") |
|
output_text = gr.Textbox( |
|
label="Result / 結果", |
|
lines=25, |
|
max_lines=25, |
|
interactive=True, |
|
placeholder="Results appear here (new results appended). / 結果將顯示在此 (新結果會附加在後面)", |
|
elem_classes="result-textbox", |
|
autoscroll=False |
|
) |
|
|
|
|
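    # Wire the option controls together: changing the model refreshes the task
    # list, language list, Phi-4 prompt, and timestamp checkbox.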
|
model_select.change(fn=update_language_choices, inputs=model_select, outputs=language_input) |
|
model_select.change(fn=update_task_choices, inputs=[model_select], outputs=[task_input]) |
|
|
|
model_select.change(fn=update_phi4_prompt_ui, inputs=[model_select, task_input, language_input], outputs=[phi4_prompt_input]) |
|
task_input.change(fn=update_phi4_prompt_ui, inputs=[model_select, task_input, language_input], outputs=[phi4_prompt_input]) |
|
language_input.change(fn=update_phi4_prompt_ui, inputs=[model_select, task_input, language_input], outputs=[phi4_prompt_input]) |
|
|
|
|
|
task_input.change(fn=update_language_ui, inputs=[model_select, task_input], outputs=language_input) |
|
model_select.change(fn=update_language_ui, inputs=[model_select, task_input], outputs=language_input) |
|
|
|
model_select.change(fn=update_timestamp_visibility, inputs=model_select, outputs=timestamp_input) |
|
|
|
|
|
mic_input.change(fn=update_download_file, inputs=mic_input, outputs=download_output) |
|
|
|
|
|
file_input.change(fn=update_file_audio_player, inputs=file_input, outputs=file_audio_player) |
|
|
|
|
|
youtube_input.change( |
|
fn=process_youtube_url, |
|
inputs=youtube_input, |
|
outputs=[youtube_audio_player, youtube_download], |
|
show_progress=True |
|
) |
|
|
|
|
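    # Wrapper around transcribe_audio: normalizes string vs. tuple results,
    # repairs degenerate "." outputs, and turns exceptions into visible text.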
|
def transcribe_audio_with_error_handling(*args): |
|
try: |
|
|
|
selected_model_identifier = args[3] |
|
model_name_for_display = selected_model_identifier |
|
|
|
|
|
audio_source = None |
|
active_tab = args[-1] |
|
|
|
if active_tab == "mic" and args[0] is not None: |
|
audio_source = args[0] |
|
elif active_tab == "file" and args[1] is not None: |
|
if isinstance(args[1], list) and len(args[1]) > 0: |
|
audio_source = args[1][0] |
|
else: |
|
audio_source = args[1] |
|
elif active_tab == "youtube" and args[2] and args[2].strip(): |
|
|
|
pass |
|
|
|
|
|
if audio_source and os.path.exists(audio_source): |
|
print(f"Processing audio file: {audio_source}") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("NEW TRANSCRIPTION PROCESS STARTED") |
|
print("="*50 + "\n") |
|
|
|
|
|
start_time = time.time() |
|
|
|
|
|
result = transcribe_audio(*args) |
|
|
|
|
|
elapsed_time = time.time() - start_time |
|
|
|
|
|
print("DEBUG: Result type:", type(result)) |
|
print("DEBUG: Final result:", result) |
|
|
|
|
|
if isinstance(result, str): |
|
if result.strip() == ".": |
|
|
|
print("DEBUG: Detected dot-only output in handler, fixing...") |
|
|
|
|
|
|
|
model_info = f"Model / 模型: {model_name_for_display}" |
|
inference_time_info = f"Processing Time / 處理時間: {elapsed_time:.2f} seconds / 秒" |
|
|
|
|
|
|
|
final_text = f"{model_info}\n{inference_time_info}\n\nResult Text / 結果文字:\n" |
|
final_text += "(Please check console for complete transcription / 請查看控制台獲取完整轉錄)" |
|
|
|
print("DEBUG: Created replacement result:", final_text[:100] + "..." if len(final_text) > 100 else final_text) |
|
else: |
|
|
|
final_text = result |
|
print("DEBUG: Using original result text") |
|
            elif isinstance(result, tuple) and len(result) > 0 and isinstance(result[0], str):
                # transcribe_audio's early-exit paths return a 4-tuple of
                # (text, *component updates); surface that text instead of
                # discarding the message it carries.
                final_text = result[0]
                print("DEBUG: Extracted result text from tuple")
            else:
                final_text = f"Model / 模型: {model_name_for_display}\n"
                final_text += f"Processing Time / 處理時間: {elapsed_time:.2f} seconds / 秒\n\n"
                final_text += "(No text detected in audio / 音頻中未檢測到文字)"
                print("DEBUG: Created new result for non-string:", final_text[:100] + "..." if len(final_text) > 100 else final_text)
|
|
|
return final_text, gr.update(), gr.update(), gr.update() |
|
        except Exception as e:
            error_msg = f"Error during processing: {str(e)}\n\n{traceback.format_exc()}"
            print(error_msg)
|
|
|
|
|
return f"處理過程中發生錯誤 / Error during processing:\n{str(e)}", gr.update(), gr.update(), gr.update() |
|
|
|
|
|
submit_button.click( |
|
fn=transcribe_audio_with_error_handling, |
|
inputs=[mic_input, file_input, youtube_input, model_select, task_input, language_input, timestamp_input, phi4_prompt_input, device_input, output_text, active_tab], |
|
outputs=[output_text, mic_input, file_input, youtube_input], |
|
show_progress="full" |
|
) |
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
cpu_info, gpu_info = get_hardware_info() |
|
has_gpu = gpu_info is not None |
|
|
|
print(f"CPU: {cpu_info}") |
|
if has_gpu: |
|
print(f"GPU: {gpu_info}") |
|
else: |
|
print("No GPU detected") |
|
|
|
|
|
|
|
|
|
    demo.launch()
|
|