| """ | |
| Used to transcribe all audio files in one folder into another folder. | |
| e.g. | |
| Directory structure: | |
| --pre_data_root | |
| ----SP_1 | |
| ------01.wav | |
| ------02.wav | |
| ------...... | |
| ----SP_2 | |
| ------01.wav | |
| ------02.wav | |
| ------...... | |
| Use | |
| python tools/whisper_asr.py --audio_dir pre_data_root/SP_1 --save_dir data/SP_1 | |
| to transcribe the first speaker. | |
| Use | |
| python tools/whisper_asr.py --audio_dir pre_data_root/SP_2 --save_dir data/SP_2 | |
| to transcribe the second speaker. | |
| Note: Be aware of your audio sample rate, which defaults to 44.1kHz. | |
| """ | |

from pathlib import Path

import click
import librosa
import soundfile as sf
import whisper
from loguru import logger
from tqdm import tqdm

from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files
from merge_asr_files import merge_and_delete_files


# Defaults below, except --sample_rate (44.1kHz per the docstring note), are
# assumptions; override them on the command line as needed.
@click.command()
@click.option("--model_size", default="medium", help="Whisper model size")
@click.option("--audio_dir", required=True, help="Directory of input audio files")
@click.option("--save_dir", required=True, help="Directory for clips and transcripts")
@click.option("--sample_rate", default=44100, type=int, help="Target sample rate")
@click.option("--device", default="cuda", help="Device for inference (cuda/cpu)")
@click.option("--language", default=None, help="Language hint; None = auto-detect")
def main(model_size, audio_dir, save_dir, sample_rate, device, language):
    logger.info("Loading / Downloading OpenAI Whisper model...")
    model = whisper.load_model(
        name=model_size,
        device=device,
        download_root=str(Path(".cache/whisper").resolve()),
    )
    logger.info("Model loaded.")
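
    # Mirror the input directory layout under save_dir so each file's clips
    # and transcripts land next to each other.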
    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    original_files = []
    audio_files = list_files(
        path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
    )
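
    # For each file: run Whisper once, then cut one audio clip plus one .lab
    # transcript per recognized segment.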
    for file_path in tqdm(audio_files, desc="Processing audio files"):
        file_stem = file_path.stem
        file_suffix = file_path.suffix

        rel_path = Path(file_path).relative_to(audio_dir)
        (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
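
        # Skip inputs that already have a final .wav/.lab pair (e.g. from a
        # previous run), so the script is safe to re-run.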
        if (save_path / rel_path.parent / f"{rel_path.stem}.wav").exists() and (
            save_path / rel_path.parent / f"{rel_path.stem}.lab"
        ).exists():
            continue
        audio, sr = librosa.load(file_path, sr=sample_rate, mono=False)
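
        # Whisper decodes the file on its own; the librosa copy above is only
        # used to slice segment clips at the requested sample rate.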
        transcription = model.transcribe(str(file_path), language=language)

        for segment in transcription.get("segments", []):
            seg_id, text, start, end = (
                segment["id"],
                segment["text"],
                segment["start"],
                segment["end"],
            )
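
            # Segment timestamps are in seconds; convert to sample offsets
            # before slicing the waveform.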
            extract = audio[..., int(start * sr) : int(end * sr)]

            audio_save_path = (
                save_path / rel_path.parent / f"{file_stem}-{seg_id}{file_suffix}"
            )
            # soundfile expects (frames, channels) while librosa returns
            # (channels, frames) for multi-channel input, so transpose if needed.
            sf.write(
                audio_save_path,
                extract.T if extract.ndim > 1 else extract,
                samplerate=sr,
            )
            original_files.append(audio_save_path)

            transcript_save_path = (
                save_path / rel_path.parent / f"{file_stem}-{seg_id}.lab"
            )
            with open(
                transcript_save_path,
                "w",
                encoding="utf-8",
            ) as f:
                f.write(text)
            original_files.append(transcript_save_path)
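
    # merge_and_delete_files (from the sibling merge_asr_files script) is
    # expected to stitch the per-segment clips/.lab files back into one pair
    # per source file and remove the intermediates listed in original_files.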
    merge_and_delete_files(save_dir, original_files)


if __name__ == "__main__":
    main()