|
--- |
|
language: |
|
- en |
|
- zh |
|
- de |
|
- es |
|
- ru |
|
- ko |
|
- fr |
|
- ja |
|
- pt |
|
- tr |
|
- pl |
|
- ca |
|
- nl |
|
- ar |
|
- sv |
|
- it |
|
- id |
|
- hi |
|
- fi |
|
- vi |
|
- he |
|
- uk |
|
- el |
|
- ms |
|
- cs |
|
- ro |
|
- da |
|
- hu |
|
- ta |
|
- no |
|
- th |
|
- ur |
|
- hr |
|
- bg |
|
- lt |
|
- la |
|
- mi |
|
- ml |
|
- cy |
|
- sk |
|
- te |
|
- fa |
|
- lv |
|
- bn |
|
- sr |
|
- az |
|
- sl |
|
- kn |
|
- et |
|
- mk |
|
- br |
|
- eu |
|
- is |
|
- hy |
|
- ne |
|
- mn |
|
- bs |
|
- kk |
|
- sq |
|
- sw |
|
- gl |
|
- mr |
|
- pa |
|
- si |
|
- km |
|
- sn |
|
- yo |
|
- so |
|
- af |
|
- oc |
|
- ka |
|
- be |
|
- tg |
|
- sd |
|
- gu |
|
- am |
|
- yi |
|
- lo |
|
- uz |
|
- fo |
|
- ht |
|
- ps |
|
- tk |
|
- nn |
|
- mt |
|
- sa |
|
- lb |
|
- my |
|
- bo |
|
- tl |
|
- mg |
|
- as |
|
- tt |
|
- haw |
|
- ln |
|
- ha |
|
- ba |
|
- jw |
|
- su |
|
tags: |
|
- audio |
|
- automatic-speech-recognition |
|
- hf-asr-leaderboard |
|
pipeline_tag: automatic-speech-recognition |
|
license: apache-2.0 |
|
license_link: https://choosealicense.com/licenses/apache-2.0/ |
|
--- |
|
|
|
# whisper-large-v3-fp16-ov |
|
* Model creator: [OpenAI](https://huggingface.co/openai) |
|
* Original model: [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) |
|
|
|
## Description |
|
This is the [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) model converted to the [OpenVINO™ IR](https://docs.openvino.ai/2025/documentation/openvino-ir-format.html) (Intermediate Representation) format with weights compressed to FP16.
|
|
|
## Compatibility |
|
|
|
The provided OpenVINO™ IR model is compatible with: |
|
|
|
* OpenVINO version 2025.2.0 and higher |
|
* Optimum Intel 1.23.0 and higher |
|
|
|
|
|
```bash
optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3-turbo --weight-format int8 --disable-stateful whisper-large-v3-turbo-int8-ov
```
|
|
|
|
|
```python
|
#!/usr/bin/env python3 |
|
import time |
|
import requests |
|
import openvino_genai |
|
import librosa |
|
from pathlib import Path |
|
from huggingface_hub import snapshot_download |
|
|
|
|
|
def download_model(model_id="FluidInference/whisper-large-v3-turbo-int8-ov-npu"):
    """Fetch a model snapshot from the Hugging Face Hub, reusing a local copy.

    Args:
        model_id: Hub repository id in ``owner/name`` form. Only the part
            after the last ``/`` is used for the local directory name.

    Returns:
        The local directory ``models/<name>`` as a ``str``.
    """
    local_dir = Path("models") / model_id.split("/")[-1]

    # A non-empty directory is treated as an already-downloaded model.
    # is_dir() (rather than exists()) keeps a stray regular file at this
    # path from making iterdir() raise NotADirectoryError.
    if local_dir.is_dir() and any(local_dir.iterdir()):
        return str(local_dir)

    print("Downloading model...")
    # NOTE(review): recent huggingface_hub releases reportedly deprecate
    # `local_dir_use_symlinks`; kept for older versions - confirm before
    # removing.
    snapshot_download(
        repo_id=model_id,
        local_dir=str(local_dir),
        local_dir_use_symlinks=False,
    )
    return str(local_dir)
|
|
|
|
|
def download_hf_audio_samples():
    """Make the Hugging Face sample recordings available locally.

    Files are stored under ``sample_audios/``. A file that already exists
    is reused without any network access; a file that cannot be downloaded
    is reported on stdout and left out of the result.

    Returns:
        List of local file paths (``str``) for every sample that is present
        after the call, in the order the samples are declared.
    """
    samples_dir = Path("sample_audios")
    samples_dir.mkdir(exist_ok=True)

    whisper_samples = [
        ("https://cdn-media.huggingface.co/speech_samples/sample1.flac", "sample1.flac"),
        ("https://cdn-media.huggingface.co/speech_samples/sample2.flac", "sample2.flac"),
    ]

    downloaded = []
    for url, filename in whisper_samples:
        filepath = samples_dir / filename

        if not filepath.exists():
            try:
                # Without a timeout requests waits indefinitely on a
                # stalled connection.
                response = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=30,
                )
                response.raise_for_status()
                # The body is fully received before the file is created,
                # so a failed transfer never leaves a partial file behind.
                filepath.write_bytes(response.content)
            except (requests.RequestException, OSError) as e:
                print(f"Error downloading {filename}: {e}")
                continue

        downloaded.append(str(filepath))

    return downloaded
|
|
|
|
|
def read_audio(filepath, sample_rate=16000):
    """Load an audio file as a list of samples at ``sample_rate`` Hz.

    Args:
        filepath: Path of the audio file, passed to ``librosa.load``.
        sample_rate: Target sampling rate handed to ``librosa.load`` as
            ``sr``. Defaults to 16000, the rate the rest of this script
            assumes when computing durations.

    Returns:
        The decoded samples as a Python list, or ``None`` when the file
        cannot be read (the error is printed).
    """
    try:
        raw_speech, _ = librosa.load(filepath, sr=sample_rate)
    except Exception as e:
        # Deliberately broad: decoding can fail in several backend-specific
        # ways and this script only needs to skip the offending file.
        print(f"Error reading {filepath}: {e}")
        return None
    return raw_speech.tolist()
|
|
|
|
|
def test_whisper_on_file(pipe, filepath):
    """Transcribe one audio file and measure how long generation takes.

    Args:
        pipe: Pipeline object providing ``get_generation_config()`` and
            ``generate(samples, config)``.
        filepath: Path of the audio file to transcribe.

    Returns:
        ``None`` when the file cannot be read or contains no samples;
        otherwise a dict with the keys ``file``, ``duration`` (seconds of
        audio), ``inference_time`` (seconds spent in ``generate``), ``rtf``
        (``inference_time / duration``) and ``transcription``
        (``str(result)``).
    """
    sample_rate = 16000  # rate read_audio() resamples to by default

    config = pipe.get_generation_config()
    config.language = "<|en|>"
    config.task = "transcribe"
    config.return_timestamps = True
    config.max_new_tokens = 448

    raw_speech = read_audio(filepath)
    if raw_speech is None:
        return None
    if not raw_speech:
        # Zero samples would make the RTF below divide by zero.
        print(f"Error reading {filepath}: no audio samples")
        return None

    duration = len(raw_speech) / sample_rate

    # perf_counter() is monotonic, so the interval is unaffected by
    # system clock adjustments.
    start_time = time.perf_counter()
    result = pipe.generate(raw_speech, config)
    inference_time = time.perf_counter() - start_time

    return {
        "file": filepath,
        "duration": duration,
        "inference_time": inference_time,
        "rtf": inference_time / duration,
        "transcription": str(result),
    }
|
|
|
|
|
def main():
    """Run the NPU transcription benchmark end to end.

    Downloads the model, builds a ``WhisperPipeline`` on the ``"NPU"``
    device, gathers ``*.wav`` files from the current directory and from
    ``samples/c/whisper_speech_recognition`` (if present) plus the Hugging
    Face sample recordings, transcribes each one and prints a summary.
    """
    # Download model
    model_path = download_model()

    # Initialize pipeline on NPU
    print("\nInitializing NPU...")
    start_time = time.perf_counter()
    pipe = openvino_genai.WhisperPipeline(model_path, "NPU")
    init_time = time.perf_counter() - start_time

    # Collect test files; sorted so runs process files in a stable order.
    test_files = sorted(Path(".").glob("*.wav"))

    samples_dir = Path("samples/c/whisper_speech_recognition")
    if samples_dir.exists():
        test_files.extend(sorted(samples_dir.glob("*.wav")))

    # Download HF samples
    test_files.extend(Path(f) for f in download_hf_audio_samples())

    # Test all files
    print(f"\nTesting {len(test_files)} files...")
    results = []
    for audio_file in test_files:
        result = test_whisper_on_file(pipe, str(audio_file))
        if result is None:
            # Make skipped files visible instead of dropping them silently.
            print(f"[FAILED] {audio_file.name}")
            continue
        results.append(result)
        print(f"[OK] {Path(result['file']).name}: RTF={result['rtf']:.2f}x")

    # Print summary
    if not results:
        return

    total_duration = sum(r["duration"] for r in results)
    total_inference = sum(r["inference_time"] for r in results)
    avg_rtf = total_inference / total_duration

    print(f"\n{'='*50}")
    print("NPU Performance Summary")
    print(f"{'='*50}")
    print(f"Model load time: {init_time:.1f}s")
    print(f"Files tested: {len(results)}")
    print(f"Total audio: {total_duration:.1f}s")
    print(f"Total inference: {total_inference:.1f}s")
    print(f"Average RTF: {avg_rtf:.2f}x {'[Faster than real-time]' if avg_rtf < 1 else '[Slower than real-time]'}")

    print("\nResults:")
    for r in results:
        trans = r['transcription'].strip()
        # Keep each summary line short: at most 60 characters of text.
        if len(trans) > 60:
            trans = trans[:57] + "..."
        print(f"- {Path(r['file']).name}: \"{trans}\"")


if __name__ == "__main__":
    main()
|
```