Upload source code

#1
by Xenova HF staff - opened
whisper-speaker-diarization/.eslintrc.cjs ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// ESLint configuration (CommonJS, as required by the .cjs extension).
// Applies the recommended core, React/JSX and hooks rule sets for this
// Vite + React project.
module.exports = {
  root: true, // stop ESLint from searching parent directories for configs
  env: { browser: true, es2020: true },
  extends: [
    'eslint:recommended',
    'plugin:react/recommended',
    'plugin:react/jsx-runtime', // disables rules obsoleted by the automatic JSX runtime
    'plugin:react-hooks/recommended',
  ],
  // Build output and this config file itself are excluded from linting.
  ignorePatterns: ['dist', '.eslintrc.cjs'],
  parserOptions: { ecmaVersion: 'latest', sourceType: 'module' },
  settings: { react: { version: '18.2' } },
  plugins: ['react-refresh'],
  rules: {
    'react/jsx-no-target-blank': 'off',
    // Warn when a module exports anything besides components, which would
    // break Vite's Fast Refresh; constant exports are explicitly allowed.
    'react-refresh/only-export-components': [
      'warn',
      { allowConstantExport: true },
    ],
  },
}
whisper-speaker-diarization/.gitignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+
15
+ # Editor directories and files
16
+ .vscode/*
17
+ !.vscode/extensions.json
18
+ .idea
19
+ .DS_Store
20
+ *.suo
21
+ *.ntvs*
22
+ *.njsproj
23
+ *.sln
24
+ *.sw?
whisper-speaker-diarization/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # React + Vite
2
+
3
+ This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
4
+
5
+ Currently, two official plugins are available:
6
+
7
+ - [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react/README.md) uses [Babel](https://babeljs.io/) for Fast Refresh
8
+ - [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
whisper-speaker-diarization/index.html ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Whisper Diarization</title>
7
+ </head>
8
+ <body>
9
+ <div id="root"></div>
10
+ <script type="module" src="/src/main.jsx"></script>
11
+ </body>
12
+ </html>
whisper-speaker-diarization/package.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "whisper-speaker-diarization",
3
+ "private": true,
4
+ "version": "0.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "vite build",
9
+ "lint": "eslint . --ext js,jsx --report-unused-disable-directives --max-warnings 0",
10
+ "preview": "vite preview"
11
+ },
12
+ "dependencies": {
13
+ "@xenova/transformers": "github:xenova/transformers.js#v3",
14
+ "react": "^18.3.1",
15
+ "react-dom": "^18.3.1"
16
+ },
17
+ "devDependencies": {
18
+ "@types/react": "^18.3.3",
19
+ "@types/react-dom": "^18.3.0",
20
+ "@vitejs/plugin-react": "^4.3.1",
21
+ "autoprefixer": "^10.4.19",
22
+ "eslint": "^8.57.0",
23
+ "eslint-plugin-react": "^7.34.2",
24
+ "eslint-plugin-react-hooks": "^4.6.2",
25
+ "eslint-plugin-react-refresh": "^0.4.7",
26
+ "postcss": "^8.4.38",
27
+ "tailwindcss": "^3.4.4",
28
+ "vite": "^5.3.1"
29
+ }
30
+ }
whisper-speaker-diarization/postcss.config.js ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
// PostCSS pipeline: run Tailwind first, then vendor-prefix the generated CSS.
const config = {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
};

export default config;
whisper-speaker-diarization/src/App.jsx ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect, useState, useRef, useCallback } from 'react';
2
+
3
+ import Progress from './components/Progress';
4
+ import MediaInput from './components/MediaInput';
5
+ import Transcript from './components/Transcript';
6
+ import LanguageSelector from './components/LanguageSelector';
7
+
8
+
9
/**
 * Detect WebGPU support by actually requesting an adapter, because
 * `navigator.gpu` can exist on platforms that still have no usable adapter.
 * @returns {Promise<boolean>} true when a WebGPU adapter is available.
 */
async function hasWebGPU() {
  const gpu = navigator.gpu;
  if (!gpu) {
    return false;
  }
  try {
    return Boolean(await gpu.requestAdapter());
  } catch {
    // requestAdapter may reject on some platforms; treat that as "no WebGPU".
    return false;
  }
}
20
+
21
// Top-level application component. Owns the Web Worker that runs the models,
// tracks loading/progress/result state, and renders the whole UI.
function App() {

  // Create a reference to the worker object.
  const worker = useRef(null);

  // Model loading and progress.
  // `status` lifecycle: null (models not loaded) -> 'loading' -> 'ready' <-> 'running'.
  const [status, setStatus] = useState(null);
  const [loadingMessage, setLoadingMessage] = useState('');
  const [progressItems, setProgressItems] = useState([]);

  const mediaInputRef = useRef(null);
  const [audio, setAudio] = useState(null); // decoded samples from MediaInput (null until a file is chosen)
  const [language, setLanguage] = useState('en');

  const [result, setResult] = useState(null); // { transcript, segments } posted by the worker
  const [time, setTime] = useState(null); // generation time in milliseconds
  const [currentTime, setCurrentTime] = useState(0); // media playback position (seconds)

  const [device, setDevice] = useState('webgpu'); // Try use WebGPU first
  const [modelSize, setModelSize] = useState('gpu' in navigator ? 196 : 77); // WebGPU=196MB, WebAssembly=77MB
  // `navigator.gpu` existing does not guarantee a usable adapter, so confirm
  // asynchronously (hasWebGPU) and fall back to WASM if none is available.
  useEffect(() => {
    hasWebGPU().then((b) => {
      setModelSize(b ? 196 : 77);
      setDevice(b ? 'webgpu' : 'wasm');
    });
  }, []);

  // We use the `useEffect` hook to setup the worker as soon as the `App` component is mounted.
  useEffect(() => {
    if (!worker.current) {
      // Create the worker if it does not yet exist.
      worker.current = new Worker(new URL('./worker.js', import.meta.url), {
        type: 'module'
      });
    }

    // Create a callback function for messages from the worker thread.
    const onMessageReceived = (e) => {
      switch (e.data.status) {
        case 'loading':
          // Model file start load: add a new progress item to the list.
          setStatus('loading');
          setLoadingMessage(e.data.data);
          break;

        case 'initiate':
          // A new model file download has begun; track it in the overlay.
          setProgressItems(prev => [...prev, e.data]);
          break;

        case 'progress':
          // Model file progress: update one of the progress items.
          setProgressItems(
            prev => prev.map(item => {
              if (item.file === e.data.file) {
                return { ...item, ...e.data }
              }
              return item;
            })
          );
          break;

        case 'done':
          // Model file loaded: remove the progress item from the list.
          setProgressItems(
            prev => prev.filter(item => item.file !== e.data.file)
          );
          break;

        case 'loaded':
          // Pipeline ready: the worker is ready to accept messages.
          setStatus('ready');
          break;

        case 'complete':
          // Inference finished: store the result and re-enable the Run button.
          setResult(e.data.result);
          setTime(e.data.time);
          setStatus('ready');
          break;
      }
    };

    // Attach the callback function as an event listener.
    worker.current.addEventListener('message', onMessageReceived);

    // Define a cleanup function for when the component is unmounted.
    return () => {
      worker.current.removeEventListener('message', onMessageReceived);
    };
  }, []);

  // First click loads the models; subsequent clicks run inference on `audio`.
  const handleClick = useCallback(() => {
    setResult(null);
    setTime(null);
    if (status === null) {
      setStatus('loading');
      worker.current.postMessage({ type: 'load', data: { device } });
    } else {
      setStatus('running');
      worker.current.postMessage({
        type: 'run', data: { audio, language }
      });
    }
  }, [status, audio, language, device]);

  return (
    <div className="flex flex-col h-screen mx-auto text-gray-800 dark:text-gray-200 bg-white dark:bg-gray-900 max-w-[600px]">

      {/* Full-screen overlay shown while model files are downloading. */}
      {status === 'loading' && (
        <div className="flex justify-center items-center fixed w-screen h-screen bg-black z-10 bg-opacity-[92%] top-0 left-0">
          <div className="w-[500px]">
            <p className="text-center mb-1 text-white text-md">{loadingMessage}</p>
            {progressItems.map(({ file, progress, total }, i) => (
              <Progress key={i} text={file} percentage={progress} total={total} />
            ))}
          </div>
        </div>
      )}
      <div className="my-auto">
        <div className="flex flex-col items-center mb-2 text-center">
          <h1 className="text-5xl font-bold mb-2">Whisper Diarization</h1>
          <h2 className="text-xl font-semibold">In-browser automatic speech recognition w/ <br />word-level timestamps and speaker segmentation</h2>
        </div>

        <div className="w-full min-h-[220px] flex flex-col justify-center items-center">
          {/* Intro blurb, hidden once a media file has been selected. */}
          {
            !audio && (
              <p className="mb-2">
                You are about to download <a href="https://huggingface.co/onnx-community/whisper-base_timestamped" target="_blank" rel="noreferrer" className="font-medium underline">whisper-base</a> and <a href="https://huggingface.co/onnx-community/pyannote-segmentation-3.0" target="_blank" rel="noreferrer" className="font-medium underline">pyannote-segmentation-3.0</a>,
                two powerful speech recognition models for generating word-level timestamps across 100 different languages and speaker segmentation, respectively.
                Once loaded, the models ({modelSize}MB + 6MB) will be cached and reused when you revisit the page.<br />
                <br />
                Everything runs locally in your browser using <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline">🤗&nbsp;Transformers.js</a> and ONNX Runtime Web,
                meaning no API calls are made to a server for inference. You can even disconnect from the internet after the model has loaded!
              </p>
            )
          }

          <div className="flex flex-col w-full m-3 max-w-[520px]">
            <span className="text-sm mb-0.5">Input audio/video</span>
            <MediaInput
              ref={mediaInputRef}
              className="flex items-center border rounded-md cursor-pointer min-h-[100px] max-h-[500px] overflow-hidden"
              onInputChange={(audio) => {
                // New media invalidates any previous transcription result.
                setResult(null);
                setAudio(audio);
              }}
              onTimeUpdate={(time) => setCurrentTime(time)}
            />
          </div>

          <div className="relative w-full flex justify-center items-center">
            {/* Disabled while running, or when models are loaded but no media chosen. */}
            <button
              className="border px-4 py-2 rounded-lg bg-blue-400 text-white hover:bg-blue-500 disabled:bg-blue-100 disabled:cursor-not-allowed select-none"
              onClick={handleClick}
              disabled={status === 'running' || (status !== null && audio === null)}
            >
              {status === null ? 'Load model' :
                status === 'running'
                  ? 'Running...'
                  : 'Run model'
              }
            </button>

            {status !== null &&
              <div className='absolute right-0 bottom-0'>
                <span className="text-xs">Language:</span>
                <br />
                <LanguageSelector className="border rounded-lg p-1 max-w-[100px]" language={language} setLanguage={setLanguage} />
              </div>
            }
          </div>

          {/* Transcript view: clicking a word seeks the media element. */}
          {
            result && time && (
              <>
                <div className="w-full mt-4 border rounded-md">
                  <Transcript
                    className="p-2 max-h-[200px] overflow-y-auto scrollbar-thin select-none"
                    transcript={result.transcript}
                    segments={result.segments}
                    currentTime={currentTime}
                    setCurrentTime={(time) => {
                      setCurrentTime(time);
                      mediaInputRef.current.setMediaTime(time);
                    }}
                  />
                </div>
                <p className="text-sm text-gray-600 text-end p-1">Generation time: <span className="text-gray-800 font-semibold">{time.toFixed(2)}ms</span></p>
              </>
            )
          }
        </div>
      </div>
    </div >
  )
}
217
+
218
+ export default App
whisper-speaker-diarization/src/components/LanguageSelector.jsx ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
/**
 * Capitalize the first letter of each word in `str`, preserving the single
 * separator character (space, slash, ...) the regex captures with each word.
 * @param {string} str
 * @returns {string} e.g. "spanish/castilian" -> "Spanish/Castilian"
 */
function titleCase(str) {
  const words = str.toLowerCase().match(/\w+.?/g) ?? [];
  let result = "";
  for (const word of words) {
    result += word[0].toUpperCase() + word.slice(1);
  }
  return result;
}
10
+
11
+ // List of supported languages:
12
+ // https://help.openai.com/en/articles/7031512-whisper-api-faq
13
+ // https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L79
14
+ const LANGUAGES = {
15
+ en: "english",
16
+ zh: "chinese",
17
+ de: "german",
18
+ es: "spanish/castilian",
19
+ ru: "russian",
20
+ ko: "korean",
21
+ fr: "french",
22
+ ja: "japanese",
23
+ pt: "portuguese",
24
+ tr: "turkish",
25
+ pl: "polish",
26
+ ca: "catalan/valencian",
27
+ nl: "dutch/flemish",
28
+ ar: "arabic",
29
+ sv: "swedish",
30
+ it: "italian",
31
+ id: "indonesian",
32
+ hi: "hindi",
33
+ fi: "finnish",
34
+ vi: "vietnamese",
35
+ he: "hebrew",
36
+ uk: "ukrainian",
37
+ el: "greek",
38
+ ms: "malay",
39
+ cs: "czech",
40
+ ro: "romanian/moldavian/moldovan",
41
+ da: "danish",
42
+ hu: "hungarian",
43
+ ta: "tamil",
44
+ no: "norwegian",
45
+ th: "thai",
46
+ ur: "urdu",
47
+ hr: "croatian",
48
+ bg: "bulgarian",
49
+ lt: "lithuanian",
50
+ la: "latin",
51
+ mi: "maori",
52
+ ml: "malayalam",
53
+ cy: "welsh",
54
+ sk: "slovak",
55
+ te: "telugu",
56
+ fa: "persian",
57
+ lv: "latvian",
58
+ bn: "bengali",
59
+ sr: "serbian",
60
+ az: "azerbaijani",
61
+ sl: "slovenian",
62
+ kn: "kannada",
63
+ et: "estonian",
64
+ mk: "macedonian",
65
+ br: "breton",
66
+ eu: "basque",
67
+ is: "icelandic",
68
+ hy: "armenian",
69
+ ne: "nepali",
70
+ mn: "mongolian",
71
+ bs: "bosnian",
72
+ kk: "kazakh",
73
+ sq: "albanian",
74
+ sw: "swahili",
75
+ gl: "galician",
76
+ mr: "marathi",
77
+ pa: "punjabi/panjabi",
78
+ si: "sinhala/sinhalese",
79
+ km: "khmer",
80
+ sn: "shona",
81
+ yo: "yoruba",
82
+ so: "somali",
83
+ af: "afrikaans",
84
+ oc: "occitan",
85
+ ka: "georgian",
86
+ be: "belarusian",
87
+ tg: "tajik",
88
+ sd: "sindhi",
89
+ gu: "gujarati",
90
+ am: "amharic",
91
+ yi: "yiddish",
92
+ lo: "lao",
93
+ uz: "uzbek",
94
+ fo: "faroese",
95
+ ht: "haitian creole/haitian",
96
+ ps: "pashto/pushto",
97
+ tk: "turkmen",
98
+ nn: "nynorsk",
99
+ mt: "maltese",
100
+ sa: "sanskrit",
101
+ lb: "luxembourgish/letzeburgesch",
102
+ my: "myanmar/burmese",
103
+ bo: "tibetan",
104
+ tl: "tagalog",
105
+ mg: "malagasy",
106
+ as: "assamese",
107
+ tt: "tatar",
108
+ haw: "hawaiian",
109
+ ln: "lingala",
110
+ ha: "hausa",
111
+ ba: "bashkir",
112
+ jw: "javanese",
113
+ su: "sundanese",
114
+ };
115
+ function LanguageSelector({ language, setLanguage, ...props }) {
116
+ const handleLanguageChange = (event) => {
117
+ setLanguage(event.target.value);
118
+ };
119
+
120
+ const names = Object.values(LANGUAGES).map(titleCase);
121
+
122
+ return (
123
+ <select
124
+ {...props}
125
+ value={language} onChange={handleLanguageChange}>
126
+ {Object.keys(LANGUAGES).map((key, i) => (
127
+ <option key={key} value={key}>
128
+ {names[i]}
129
+ </option>
130
+ ))}
131
+ </select>
132
+ );
133
+ }
134
+ export default LanguageSelector
whisper-speaker-diarization/src/components/MediaInput.jsx ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, forwardRef, useRef, useImperativeHandle, useEffect, useCallback } from 'react';
2
+
3
+ const EXAMPLE_URL = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/hopper.webm';
4
+
5
// Drag-and-drop / click-to-select media input. Decodes the chosen audio or
// video file to 16 kHz samples (delivered via `onInputChange`), previews it
// in an <audio> or <video> element, and reports the playback position each
// animation frame via `onTimeUpdate`.
const MediaInput = forwardRef(({ onInputChange, onTimeUpdate, ...props }, ref) => {
  // UI states
  const [dragging, setDragging] = useState(false);
  const fileInputRef = useRef(null);

  // Create a reference to the audio and video elements
  const audioElement = useRef(null);
  const videoElement = useRef(null);

  // Last playback time reported, used to suppress redundant onTimeUpdate calls.
  const currentTimeRef = useRef(0);
  useImperativeHandle(ref, () => ({
    // Seek whichever media element currently has a source to `time` seconds.
    setMediaTime(time) {
      if (audioElement.current?.src) {
        audioElement.current.currentTime = time;
      } else if (videoElement.current?.src) {
        videoElement.current.currentTime = time;
      }
      currentTimeRef.current = time;
    }
  }));

  // Decode the raw bytes and point the matching media element at a blob URL.
  const onBufferLoad = (arrayBuffer, type) => {
    // slice(0) copies the bytes so decodeAudioData (in processFile) can
    // detach the original buffer without invalidating the blob's data.
    const blob = new Blob([arrayBuffer.slice(0)], { type: type });
    const url = URL.createObjectURL(blob);
    processFile(arrayBuffer); // NOTE(review): fire-and-forget async call — decode errors surface via alert() inside processFile

    // Create a URL for the Blob
    if (type.startsWith('audio/')) {
      // Dispose the previous source
      videoElement.current.pause();
      videoElement.current.removeAttribute('src');
      videoElement.current.load();

      audioElement.current.src = url;
    } else if (type.startsWith('video/')) {
      // Dispose the previous source
      audioElement.current.pause();
      audioElement.current.removeAttribute('src');
      audioElement.current.load();

      videoElement.current.src = url;
    } else {
      alert(`Unsupported file type: ${type}`);
    }
  }

  // Read a File into an ArrayBuffer and forward it to onBufferLoad.
  const readFile = (file) => {
    if (!file) return;

    // file.type
    const reader = new FileReader();
    reader.onload = (e) => {
      onBufferLoad(e.target.result, file.type);
    }
    reader.readAsArrayBuffer(file);
  }

  const handleInputChange = (event) => {
    readFile(event.target.files[0]);
  };

  const handleDragOver = (event) => {
    // preventDefault is required so the subsequent drop event fires.
    event.preventDefault();
  };

  const handleDrop = (event) => {
    event.preventDefault();
    setDragging(false);
    readFile(event.dataTransfer.files[0]);
  };

  // Clicking anywhere in the component (including on the media elements)
  // re-opens the file picker; clicks on the hidden input itself pass through.
  const handleClick = (e) => {
    if (e.target.tagName === 'VIDEO' || e.target.tagName === 'AUDIO') {
      e.preventDefault();
      fileInputRef.current.click();
    } else if (e.target.tagName === 'INPUT') {
      e.stopPropagation();
    } else {
      fileInputRef.current.click();
      e.stopPropagation();
    }
  };

  // Decode the buffer at 16 kHz mono and hand the Float32 samples to the parent.
  const processFile = async (buffer) => {
    const audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16_000 });

    try {
      const audioBuffer = await audioContext.decodeAudioData(buffer);
      let audio;
      if (audioBuffer.numberOfChannels === 2) {
        // Merge channels
        // sqrt(2) compensates the ~3 dB level drop from averaging two channels.
        const SCALING_FACTOR = Math.sqrt(2);
        const left = audioBuffer.getChannelData(0);
        const right = audioBuffer.getChannelData(1);
        audio = new Float32Array(left.length);
        for (let i = 0; i < audioBuffer.length; ++i) {
          audio[i] = SCALING_FACTOR * (left[i] + right[i]) / 2;
        }
      } else {
        audio = audioBuffer.getChannelData(0);
      }
      onInputChange(audio);

    } catch (e) {
      alert(e);
    }
  };

  const requestRef = useRef();

  // Poll the active media element's currentTime once per animation frame,
  // notifying the parent only when it actually changed.
  const updateTime = useCallback(() => {
    let elem;
    if (audioElement.current?.src) {
      elem = audioElement.current;

    } else if (videoElement.current?.src) {
      elem = videoElement.current;
    }

    if (elem && currentTimeRef.current !== elem.currentTime) {
      currentTimeRef.current = elem.currentTime;
      onTimeUpdate(elem.currentTime);
    }

    // Request the next frame
    requestRef.current = requestAnimationFrame(updateTime);
  }, [onTimeUpdate]);

  useEffect(() => {
    // Start the animation
    requestRef.current = requestAnimationFrame(updateTime);

    return () => {
      // Cleanup on component unmount
      cancelAnimationFrame(requestRef.current);
    };
  }, [updateTime]);
  return (
    <div
      {...props}
      onClick={handleClick}
      onDragOver={handleDragOver}
      onDrop={handleDrop}
      onDragEnter={(e) => setDragging(true)}
      onDragLeave={(e) => setDragging(false)}
    >
      <input
        type="file"
        accept="audio/*,video/*"
        onChange={handleInputChange}
        ref={fileInputRef}
        className="hidden"
      />
      {/* Both media elements are always mounted; visibility follows which one has a src. */}
      {
        <audio
          ref={audioElement}
          controls
          style={{ display: audioElement.current?.src ? 'block' : 'none' }}
          className='w-full max-h-full'
        />
      }
      {
        <video
          ref={videoElement}
          controls
          style={{ display: videoElement.current?.src ? 'block' : 'none' }}
          className='w-full max-h-full'
        />
      }
      {/* Placeholder / drop target shown until a media source is set. */}
      {
        !audioElement.current?.src && !videoElement.current?.src && (
          <div className="w-full flex flex-col items-center justify-center border-2 border-dashed border-gray-300 rounded-md h-[250px]"
            style={{ borderColor: dragging ? 'blue' : 'lightgray' }}
          >
            <span className="text-gray-600 text-center"><u>Drag & drop</u> or <u>click</u><br />to select media</span>
            <span className="text-gray-500 text-sm hover:text-gray-800 mt-2" onClick={async (e) => {
              e.stopPropagation();
              const buffer = await fetch(EXAMPLE_URL).then((r) => r.arrayBuffer());
              videoElement.current.src = URL.createObjectURL(new Blob([buffer], { type: 'video/mp4' }));
              onBufferLoad(buffer, 'video/mp4');
            }}>(or <u>try an example</u>)</span>
          </div>
        )
      }
    </div>
  );
});
192
+ MediaInput.displayName = 'MediaInput';
193
+
194
+ export default MediaInput;
whisper-speaker-diarization/src/components/Progress.jsx ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Format a byte count as a human-readable string, e.g. 1536 -> "1.5kB".
 * @param {number} size - Non-negative number of bytes.
 * @returns {string} Value rounded to at most 2 decimal places plus unit suffix.
 */
function formatBytes(size) {
  // Strict equality instead of the original loose `==`; the zero guard is
  // needed because Math.log(0) is -Infinity.
  const i = size === 0 ? 0 : Math.floor(Math.log(size) / Math.log(1024));
  // `+(...).toFixed(2) * 1` drops trailing zeros ("1.50" -> 1.5).
  return +((size / Math.pow(1024, i)).toFixed(2)) * 1 + ['B', 'kB', 'MB', 'GB', 'TB'][i];
}
5
+
6
+ export default function Progress({ text, percentage, total }) {
7
+ percentage ??= 0;
8
+ return (
9
+ <div className="w-full bg-gray-100 dark:bg-gray-700 text-left rounded-lg overflow-hidden mb-0.5">
10
+ <div className="bg-blue-400 whitespace-nowrap px-1 text-sm" style={{ width: `${percentage}%` }}>
11
+ {text} ({percentage.toFixed(2)}%{isNaN(total) ? '' : ` of ${formatBytes(total)}`})
12
+ </div>
13
+ </div>
14
+ );
15
+ }
whisper-speaker-diarization/src/components/Transcript.jsx ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect, useMemo, useRef } from "react";
2
+
3
// Renders a single transcript word. The word is highlighted (underline plus a
// subtle text shadow) while the media playback position lies inside its
// timestamp window, and is smooth-scrolled into view when it becomes active.
const Chunk = ({ chunk, currentTime, onClick, ...props }) => {
  const spanRef = useRef(null);
  const { text, timestamp } = chunk;
  const [start, end] = timestamp;

  // Playback is currently inside this word's [start, end) window.
  const bolded = start <= currentTime && currentTime < end;

  useEffect(() => {
    if (spanRef.current && bolded) { // scroll into view
      spanRef.current.scrollIntoView({
        behavior: 'smooth',
        block: 'center',
        inline: 'center',
      });
    }
  }, [bolded]);

  return (
    <span {...props}>
      {/* Re-insert a leading space that `text.trim()` below would drop. */}
      {text.startsWith(' ') ? " " : ""}
      <span
        ref={spanRef}
        onClick={onClick}
        className="text-md text-gray-600 cursor-pointer hover:text-red-600"
        title={timestamp.map(x => x.toFixed(2)).join(' → ')}
        style={{
          textDecoration: bolded ? 'underline' : 'none',
          textShadow: bolded ? '0 0 1px #000' : 'none',
        }}
      >{text.trim()}</span>
    </span>
  )
}
36
+
37
+ const Transcript = ({ transcript, segments, currentTime, setCurrentTime, ...props }) => {
38
+ const jsonTranscript = useMemo(() => {
39
+ return JSON.stringify({
40
+ ...transcript,
41
+ segments,
42
+ }, null, 2)
43
+ // post-process the JSON to make it more readable
44
+ .replace(/( {4}"timestamp": )\[\s+(\S+)\s+(\S+)\s+\]/gm, "$1[$2 $3]");
45
+ }, [transcript, segments]);
46
+
47
+ // Post-process the transcript to highlight speaker changes
48
+ const postProcessedTranscript = useMemo(() => {
49
+ let prev = 0;
50
+ const words = transcript.chunks;
51
+
52
+ const result = [];
53
+ for (const segment of segments) {
54
+ const { label, end } = segment;
55
+ if (label === 'NO_SPEAKER') continue;
56
+
57
+ // Collect all words within this segment
58
+ const segmentWords = [];
59
+ for (let i = prev; i < words.length; ++i) {
60
+ const word = words[i];
61
+ if (word.timestamp[1] <= end) {
62
+ segmentWords.push(word);
63
+ } else {
64
+ prev = i;
65
+ break;
66
+ }
67
+ }
68
+ if (segmentWords.length > 0) {
69
+ result.push({
70
+ ...segment,
71
+ chunks: segmentWords,
72
+ })
73
+ }
74
+ }
75
+ return result;
76
+ }, [transcript, segments]);
77
+
78
+ const downloadTranscript = () => {
79
+ const blob = new Blob([jsonTranscript], { type: 'application/json' });
80
+ const url = URL.createObjectURL(blob);
81
+ const a = document.createElement('a');
82
+ a.href = url;
83
+ a.download = 'transcript.json';
84
+ a.click();
85
+ URL.revokeObjectURL(url);
86
+ }
87
+
88
+ return (<>
89
+ <div {...props}>
90
+ {
91
+ postProcessedTranscript.map(({ label, start, end, chunks }, i) => (
92
+ <div className="border-t py-2" key={i}>
93
+ <div className="flex justify-between">
94
+ <label className="text-xs font-medium">{label}</label>
95
+ <label className="text-xs">{start.toFixed(2)} &rarr; {end.toFixed(2)}</label>
96
+ </div>
97
+ <div>
98
+ {chunks.map((chunk, j) =>
99
+ <Chunk
100
+ key={j}
101
+ chunk={chunk}
102
+ currentTime={currentTime}
103
+ onClick={() => setCurrentTime(chunk.timestamp[0])} // Set to start of chunk
104
+ />
105
+ )}
106
+ </div>
107
+ </div>
108
+ ))
109
+ }
110
+ </div>
111
+
112
+ <div className="flex justify-center border-t text-sm text-gray-600 max-h-[150px] overflow-y-auto p-2 scrollbar-thin">
113
+ <button
114
+ className="flex items-center border px-2 py-1 rounded-lg bg-green-400 text-white hover:bg-green-500"
115
+ onClick={downloadTranscript}
116
+ >
117
+ <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" strokeWidth={1.5} stroke="currentColor" className="size-6 mr-1">
118
+ <path strokeLinecap="round" strokeLinejoin="round" d="M3 16.5v2.25A2.25 2.25 0 0 0 5.25 21h13.5A2.25 2.25 0 0 0 21 18.75V16.5M16.5 12 12 16.5m0 0L7.5 12m4.5 4.5V3" />
119
+ </svg>
120
+ Download transcript
121
+ </button>
122
+ </div>
123
+ </>)
124
+ };
125
+ export default Transcript;
whisper-speaker-diarization/src/index.css ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @tailwind base;
2
+ @tailwind components;
3
+ @tailwind utilities;
4
+
5
+ @layer utilities {
6
+ .scrollbar-thin::-webkit-scrollbar {
7
+ @apply w-2;
8
+ }
9
+
10
+ .scrollbar-thin::-webkit-scrollbar-track {
11
+ @apply rounded-full bg-gray-100 dark:bg-gray-700;
12
+ }
13
+
14
+ .scrollbar-thin::-webkit-scrollbar-thumb {
15
+ @apply rounded-full bg-gray-300 dark:bg-gray-600;
16
+ }
17
+
18
+ .scrollbar-thin::-webkit-scrollbar-thumb:hover {
19
+ @apply bg-gray-500;
20
+ }
21
+ }
22
+
23
+ html {
24
+ @apply scrollbar-thin;
25
+ }
whisper-speaker-diarization/src/main.jsx ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import React from 'react'
import ReactDOM from 'react-dom/client'
import App from './App.jsx'
import './index.css'

// Mount the app at the #root element declared in index.html. StrictMode
// double-invokes effects in development to surface side-effect bugs early.
ReactDOM.createRoot(document.getElementById('root')).render(
  <React.StrictMode>
    <App />
  </React.StrictMode>,
)
whisper-speaker-diarization/src/worker.js ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import { pipeline, AutoProcessor, AutoModelForAudioFrameClassification } from '@xenova/transformers';
3
+
4
// Model loading options keyed by execution device, spread into the
// `pipeline(...)` call in PipelineSingeton.getInstance.
const PER_DEVICE_CONFIG = {
  webgpu: {
    dtype: {
      encoder_model: 'fp32',
      // presumably 4-bit quantized weights for the merged decoder — see
      // transformers.js dtype documentation to confirm.
      decoder_model_merged: 'q4',
    },
    device: 'webgpu',
  },
  wasm: {
    dtype: 'q8',
    device: 'wasm',
  },
};
17
+
18
/**
 * This class uses the Singleton pattern to ensure that only one instance of the model is loaded.
 *
 * NOTE(review): the class name is misspelled ("Singeton"), but it is kept
 * as-is because it is referenced throughout this worker.
 */
class PipelineSingeton {
  static asr_model_id = 'onnx-community/whisper-base_timestamped';
  static asr_instance = null;

  static segmentation_model_id = 'onnx-community/pyannote-segmentation-3.0';
  static segmentation_instance = null;
  static segmentation_processor = null;

  // Lazily create (and memoize) the ASR pipeline plus the segmentation
  // processor/model. The memoized values are the *promises*, so concurrent
  // callers share the same in-flight loads rather than loading twice.
  static async getInstance(progress_callback = null, device = 'webgpu') {
    this.asr_instance ??= pipeline('automatic-speech-recognition', this.asr_model_id, {
      ...PER_DEVICE_CONFIG[device],
      progress_callback,
    });

    this.segmentation_processor ??= AutoProcessor.from_pretrained(this.segmentation_model_id, {
      progress_callback,
    });
    this.segmentation_instance ??= AutoModelForAudioFrameClassification.from_pretrained(this.segmentation_model_id, {
      // NOTE: WebGPU is not currently supported for this model
      // See https://github.com/microsoft/onnxruntime/issues/21386
      device: 'wasm',
      dtype: 'fp32',
      progress_callback,
    });

    // Resolve all three together; returns [asr, processor, segmentation model].
    return Promise.all([this.asr_instance, this.segmentation_processor, this.segmentation_instance]);
  }
}
49
+
50
/**
 * Load (and cache) all models, reporting progress to the main thread.
 * Posts `loading` status updates while downloading and a final `loaded`
 * message once the pipeline is ready to accept `run` requests.
 * @param {{ device: 'webgpu' | 'wasm' }} param0 - Target execution device.
 */
async function load({ device }) {
  self.postMessage({
    status: 'loading',
    data: `Loading models (${device})...`
  });

  // Forward every download-progress event from the loaders straight to the
  // UI thread, which renders them as progress bars.
  const reportProgress = (progress) => self.postMessage(progress);
  const [transcriber] = await PipelineSingeton.getInstance(reportProgress, device);

  if (device === 'webgpu') {
    self.postMessage({
      status: 'loading',
      data: 'Compiling shaders and warming up model...'
    });

    // Run one second of silence through the model so the WebGPU shaders are
    // compiled before the first real request.
    await transcriber(new Float32Array(16_000), {
      language: 'en',
    });
  }

  self.postMessage({ status: 'loaded' });
}
76
+
77
/**
 * Run speaker segmentation over `audio`.
 * @param processor - Pyannote feature processor (also post-processes logits).
 * @param model - Audio frame-classification model.
 * @param audio - Mono audio samples.
 * @returns Speaker segments, each annotated with a human-readable `label`.
 */
async function segment(processor, model, audio) {
  const inputs = await processor(audio);
  const { logits } = await model(inputs);

  // Only the first (and only) batch item is relevant.
  const segments = processor.post_process_speaker_diarization(logits, audio.length)[0];

  // Map each numeric class id to its human-readable speaker label.
  segments.forEach((seg) => {
    seg.label = model.config.id2label[seg.id];
  });

  return segments;
}
89
+
90
/**
 * Transcribe `audio` with word-level timestamps and diarize it, then post the
 * combined result (and wall-clock time in ms) back to the main thread.
 * Assumes `load` has already been called; getInstance returns cached models.
 * @param {{ audio: Float32Array, language: string }} param0
 */
async function run({ audio, language }) {
  const [transcriber, segmentation_processor, segmentation_model] = await PipelineSingeton.getInstance();

  const start = performance.now();

  // ASR and speaker segmentation are independent, so run them concurrently.
  const transcriptionPromise = transcriber(audio, {
    language,
    return_timestamps: 'word',
    chunk_length_s: 30,
  });
  const segmentationPromise = segment(segmentation_processor, segmentation_model, audio);

  const [transcript, segments] = await Promise.all([transcriptionPromise, segmentationPromise]);
  console.table(segments, ['start', 'end', 'id', 'label', 'confidence']);

  const end = performance.now();

  self.postMessage({ status: 'complete', result: { transcript, segments }, time: end - start });
}
110
+
111
// Listen for messages from the main thread and dispatch on message type.
self.addEventListener('message', async (e) => {
  const { type, data } = e.data;

  switch (type) {
    case 'load':
      // NOTE(review): the returned promise is intentionally not awaited, so a
      // rejection here is an unhandled rejection — consider posting an
      // explicit error status back to the UI.
      load(data);
      break;

    case 'run':
      run(data);
      break;
  }
});
whisper-speaker-diarization/tailwind.config.js ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /** @type {import('tailwindcss').Config} */
2
+ export default {
3
+ content: [
4
+ "./index.html",
5
+ "./src/**/*.{js,ts,jsx,tsx}",
6
+ ],
7
+ theme: {
8
+ extend: {},
9
+ },
10
+ plugins: [],
11
+ }
12
+
whisper-speaker-diarization/vite.config.js ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'

// https://vitejs.dev/config/
// Minimal Vite setup: only the official React plugin (Babel-based Fast Refresh).
export default defineConfig({
  plugins: [react()],
})
7
+ })