Spaces:
Build error
Build error
Refactor pad and merge timestamps into one function
Browse files
This also fixes several issues regarding when the timestamps
should be merged.
- src/segments.py +47 -0
- src/vad.py +6 -66
- tests/segments_test.py +48 -0
src/segments.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, List
|
| 2 |
+
|
| 3 |
+
import copy
|
| 4 |
+
|
| 5 |
+
def merge_timestamps(timestamps: List[Dict[str, Any]], merge_window: float = 5, max_merge_size: float = 30, padding_left: float = 1, padding_right: float = 1):
    """
    Merge neighbouring timestamp segments and pad the resulting segments.

    Segments are processed in the order given. A segment is merged into the
    current (already started) segment unless the gap since the previously
    processed segment exceeds ``merge_window``, or the merge would make the
    current segment longer than ``max_merge_size``.

    Parameters
    ----------
    timestamps:
        Segments as dicts with at least the keys ``'start'`` and ``'end'``.
        The input dicts are never modified; every output segment is a deep
        copy of the first input segment of its group, so any additional keys
        of that first segment are carried over.
    merge_window:
        Largest gap that is still merged. ``None`` disables merging, so each
        input segment becomes its own (padded) output segment.
    max_merge_size:
        Largest allowed span of a merged segment, measured from the (already
        left-padded) start of the current segment to the end of the candidate.
        ``None`` means there is no size limit.
    padding_left, padding_right:
        Amount subtracted from each segment start / added to each segment end.
        ``None`` is treated as 0.

    Returns
    -------
    A new list of merged and padded segments (empty for empty input).
    """
    if not timestamps:
        return []

    # Optional settings may arrive as None; treat a missing padding as no padding.
    padding_left = 0 if padding_left is None else padding_left
    padding_right = 0 if padding_right is None else padding_right

    result = []
    processed_time = 0
    current_segment = None

    for next_segment in timestamps:
        # Gap between the end of what has been processed so far and this segment
        delta = next_segment['start'] - processed_time

        if current_segment is None:
            start_new_segment = True
        else:
            gap_too_large = merge_window is None or delta > merge_window
            # Note that segments can still be longer than the max merge size,
            # they just won't be merged in that case
            merged_too_long = max_merge_size is not None \
                and next_segment['end'] - current_segment['start'] > max_merge_size
            start_new_segment = gap_too_large or merged_too_long

        if not start_new_segment:
            # Merge: extend the current segment up to the end of this one
            current_segment['end'] = next_segment['end']
            processed_time = current_segment['end']
            continue

        if current_segment is not None:
            # Finish the current segment. If the gap cannot hold both paddings,
            # the finished segment gets at most half of it, so that the right
            # padding never runs into the next segment.
            if delta < padding_left + padding_right:
                finish_padding = min(padding_right, delta / 2)
            else:
                finish_padding = padding_right

            current_segment['end'] += finish_padding
            delta -= finish_padding
            result.append(current_segment)

        # Start a new segment on a copy, so the caller's data stays untouched
        current_segment = copy.deepcopy(next_segment)

        # Left padding is limited to whatever is left of the gap
        current_segment['start'] = current_segment['start'] - min(padding_left, delta)
        processed_time = current_segment['end']

    # The last segment has no successor, so it always gets the full right padding
    if current_segment is not None:
        current_segment['end'] += padding_right
        result.append(current_segment)

    return result
|
src/vad.py
CHANGED
|
@@ -5,6 +5,8 @@ from typing import Any, Deque, Iterator, List, Dict
|
|
| 5 |
|
| 6 |
from pprint import pprint
|
| 7 |
|
|
|
|
|
|
|
| 8 |
# Workaround for https://github.com/tensorflow/tensorflow/issues/48797
|
| 9 |
try:
|
| 10 |
import tensorflow as tf
|
|
@@ -110,8 +112,10 @@ class AbstractTranscription(ABC):
|
|
| 110 |
# get speech timestamps from full audio file
|
| 111 |
seconds_timestamps = self.get_transcribe_timestamps(audio)
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
| 115 |
|
| 116 |
# A deque of transcribed segments that is passed to the next segment as a prompt
|
| 117 |
prompt_window = deque()
|
|
@@ -346,70 +350,6 @@ class AbstractTranscription(ABC):
|
|
| 346 |
result.append(new_segment)
|
| 347 |
return result
|
| 348 |
|
| 349 |
-
def pad_timestamps(self, timestamps: List[Dict[str, Any]], padding_left: float, padding_right: float):
    """
    Return new ``{'start', 'end'}`` entries with padding applied on both sides.

    The input list itself is returned when both paddings are 0. A padding of
    ``None`` leaves that side of every entry untouched. A padded start never
    goes below 0 or below the padded end of the previous entry, and a padded
    end never goes past the start of the following entry.
    """
    if padding_left == 0 and padding_right == 0:
        return timestamps

    padded = []
    # Lower bound for the next start: end of the previously emitted entry (0 at the beginning)
    earliest_start = 0
    last_index = len(timestamps) - 1

    for index, entry in enumerate(timestamps):
        begin = entry['start']
        finish = entry['end']

        if padding_left is not None:
            begin = max(earliest_start, begin - padding_left)

        if padding_right is not None:
            finish = finish + padding_right

            # Do not pad past the next segment
            if index < last_index:
                finish = min(timestamps[index + 1]['start'], finish)

        padded.append({ 'start': begin, 'end': finish })
        earliest_start = finish

    return padded
|
| 377 |
-
|
| 378 |
-
def merge_timestamps(self, timestamps: List[Dict[str, Any]], max_merge_gap: float, max_merge_size: float,
                     min_force_merge_gap: float, max_force_merge_size: float):
    """
    Merge consecutive entries that are close to each other.

    An entry is merged into the current entry when either
    - the gap to the current entry is at most ``max_merge_gap`` and the current
      entry is not longer than ``max_merge_size`` (``None`` = no size limit), or
    - ``min_force_merge_gap`` is set, the gap is at most ``min_force_merge_gap``
      and the current entry is not longer than ``max_force_merge_size``
      (``None`` = no size limit).

    Returns the input list unchanged when ``max_merge_gap`` is ``None``.
    Otherwise returns a new list; the entries in it are copies, so the dicts
    passed in by the caller are not modified.
    """
    if max_merge_gap is None:
        return timestamps

    result = []
    current_entry = None

    for entry in timestamps:
        if current_entry is None:
            # Work on a copy: the 'end' of the current entry is overwritten when merging
            current_entry = dict(entry)
            continue

        # Get distance to the previous entry
        distance = entry['start'] - current_entry['end']
        current_entry_size = current_entry['end'] - current_entry['start']

        if distance <= max_merge_gap and (max_merge_size is None or current_entry_size <= max_merge_size):
            # Regular merge
            current_entry['end'] = entry['end']
        elif min_force_merge_gap is not None and distance <= min_force_merge_gap and \
                (max_force_merge_size is None or current_entry_size <= max_force_merge_size):
            # Force merge if the distance is small (up to a certain maximum size)
            current_entry['end'] = entry['end']
        else:
            # Output current entry
            result.append(current_entry)
            current_entry = dict(entry)

    # Add final entry
    if current_entry is not None:
        result.append(current_entry)

    return result
|
| 412 |
-
|
| 413 |
def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):
|
| 414 |
result = []
|
| 415 |
|
|
|
|
| 5 |
|
| 6 |
from pprint import pprint
|
| 7 |
|
| 8 |
+
from src.segments import merge_timestamps
|
| 9 |
+
|
| 10 |
# Workaround for https://github.com/tensorflow/tensorflow/issues/48797
|
| 11 |
try:
|
| 12 |
import tensorflow as tf
|
|
|
|
| 112 |
# get speech timestamps from full audio file
|
| 113 |
seconds_timestamps = self.get_transcribe_timestamps(audio)
|
| 114 |
|
| 115 |
+
#for seconds_timestamp in seconds_timestamps:
|
| 116 |
+
# print("VAD timestamp ", format_timestamp(seconds_timestamp['start']), " to ", format_timestamp(seconds_timestamp['end']))
|
| 117 |
+
|
| 118 |
+
merged = merge_timestamps(seconds_timestamps, self.max_silent_period, self.max_merge_size, self.segment_padding_left, self.segment_padding_right)
|
| 119 |
|
| 120 |
# A deque of transcribed segments that is passed to the next segment as a prompt
|
| 121 |
prompt_window = deque()
|
|
|
|
| 350 |
result.append(new_segment)
|
| 351 |
return result
|
| 352 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):
|
| 354 |
result = []
|
| 355 |
|
tests/segments_test.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import unittest
|
| 3 |
+
|
| 4 |
+
sys.path.append('../whisper-webui')
|
| 5 |
+
|
| 6 |
+
from src.segments import merge_timestamps
|
| 7 |
+
|
| 8 |
+
class TestSegments(unittest.TestCase):
    """Tests for merging and padding of timestamp segments (``merge_timestamps``)."""

    def assert_segments_almost_equal(self, actual, expected):
        """
        Compare two segment lists key by key with a float tolerance.

        The boundaries are computed with floating point arithmetic, so an exact
        comparison of the resulting values is not reliable.
        """
        self.assertEqual(len(actual), len(expected))

        for index, (actual_segment, expected_segment) in enumerate(zip(actual, expected)):
            with self.subTest(segment=index):
                self.assertEqual(set(actual_segment), set(expected_segment))

                for key, expected_value in expected_segment.items():
                    self.assertAlmostEqual(actual_segment[key], expected_value)

    def test_merge_segments(self):
        segments = [
            {'start': 10.0, 'end': 20.0},
            {'start': 22.0, 'end': 27.0},
            {'start': 31.0, 'end': 35.0},
            {'start': 45.0, 'end': 60.0},
            {'start': 61.0, 'end': 65.0},
            {'start': 68.0, 'end': 98.0},
            {'start': 100.0, 'end': 102.0},
            {'start': 110.0, 'end': 112.0}
        ]

        result = merge_timestamps(segments, merge_window=5, max_merge_size=30, padding_left=1, padding_right=1)

        self.assert_segments_almost_equal(result, [
            {'start': 9.0, 'end': 36.0},
            {'start': 44.0, 'end': 66.0},
            {'start': 67.0, 'end': 99.0},
            {'start': 99.0, 'end': 103.0},
            {'start': 109.0, 'end': 113.0}
        ])

    def test_overlap_next(self):
        # The gap between the two segments is smaller than the combined padding,
        # so both segments are expected to meet in the middle of the gap
        segments = [
            {'start': 5.0, 'end': 39.182},
            {'start': 39.986, 'end': 40.814}
        ]

        result = merge_timestamps(segments, merge_window=5, max_merge_size=30, padding_left=1, padding_right=1)

        self.assert_segments_almost_equal(result, [
            {'start': 4.0, 'end': 39.584},
            {'start': 39.584, 'end': 41.814}
        ])


if __name__ == '__main__':
    unittest.main()
|