Joshua Lochner committed · Commit 3879103 · Parent(s): 915339e

Improve preprocessing

Files changed: src/preprocess.py (+190 −175)

src/preprocess.py (CHANGED)
@@ -1,3 +1,6 @@
+from utils import jaccard
+from shared import START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE
+from functools import lru_cache
 from datetime import datetime
 import itertools
 from typing import Optional, List
@@ -13,12 +16,12 @@ import re
 import random
 import logging
 from youtube_transcript_api import YouTubeTranscriptApi
-from youtube_transcript_api._errors import CouldNotRetrieveTranscript, YouTubeRequestFailed
+from youtube_transcript_api._errors import CouldNotRetrieveTranscript, YouTubeRequestFailed, TooManyRequests
 import os
 import json
 import time
 import requests
-from utils import …
+from utils import Task, InterruptibleTaskPool


 def find(s, ch):
@@ -106,87 +109,84 @@ def get_auto_words(transcript_list):
     return words


+def list_transcripts(video_id):
+    return YouTubeTranscriptApi.list_transcripts(video_id)
+
+
+@lru_cache(maxsize=16)
 def get_words(video_id, process=True, fallback=True, transcript_type='auto'):
     """Get parsed video transcript with caching system
     returns None if not processed yet and process is False
     """
     get_manual_if_fail = fallback and transcript_type == 'auto'
-    transcript_path = os.path.join(
+    transcript_path = os.path.join(  # TODO use relative path to this
         'transcripts', transcript_type, f'{video_id}.json')
     words = []
     try:
-        if os.path.exists(transcript_path):
-            with open(transcript_path) as fp:
-                wds = json.load(fp)
-
-            if not wds and get_manual_if_fail:
-                return get_words(video_id, process, fallback, 'manual')
-            return wds
-
-        elif …
-            …
-        else:
-            words = get_auto_words(transcript_list)
+        if os.path.exists(transcript_path):  # Load from file
+            with open(transcript_path) as fp:
+                words = json.load(fp)
+
+        elif process:
+            transcript_list = list_transcripts(video_id)
+
+            if transcript_type == 'manual':
+                words = get_manual_words(transcript_list)
+            else:
+                words = get_auto_words(transcript_list)

-    except YouTubeRequestFailed as e:
+    except (TooManyRequests, YouTubeRequestFailed, requests.exceptions.ConnectionError) as e:  # Can retry
         print(e)
-        time.sleep(…)
+        time.sleep(10)  # Timeout
         return get_words(video_id, process, fallback, transcript_type)

     except CouldNotRetrieveTranscript:
-        if get_manual_if_fail:
-            print('fallback')
-            return get_words(video_id, process, fallback, 'manual')
-
-    except json.decoder.JSONDecodeError:
-        # Warning, unable to parse JSON
         pass
+    except json.decoder.JSONDecodeError:
+        print('JSONDecodeError for', video_id)
+        os.remove(transcript_path)  # Remove file and try again
+        return get_words(video_id, process, fallback, transcript_type)

+    # Even save empty
     with open(transcript_path, 'w') as fp:
         json.dump(words, fp)

+    if not words and get_manual_if_fail:
+        return get_words(video_id, process, fallback, 'manual')
+
     return words


 # TODO make min_sponsor_segment_length param
-def extract_sponsors(words, min_sponsor_segment_length=…):
-    if …
-        return []
+def extract_sponsors(words, min_sponsor_segment_length=3):
+    if not words:
+        return []

     paragraphs = []
     current = []
     prev_category = None
-    for word in words:
-        if word['category'] is None:  # and not current:
-            continue  # Skip unimportant
-
-        …
-            paragraphs.append({
-                'words': current,
-                'category': prev_category,
-            })
-            current = []
-
-        …
-        paragraphs.append({
-            'words': current,
-            'category': prev_category,
-        })
-
-        …
+
+    i = 0
+    while i <= len(words):
+        unimportant = i == len(words) or words[i]['category'] is None
+
+        if unimportant or words[i]['category'] != prev_category:
+            if current:  # Save the current batch
+                paragraphs.append({
+                    'words': current,
+                    'category': current[-1]['category'],
+                })
+
+            current = []
+
+        if not unimportant:  # Some useful information to save
+            current.append(words[i])
+            prev_category = words[i]['category']
+
+        i += 1
+
+    # Remove all too short:
+    return list(filter(lambda x: len(x['words']) >= min_sponsor_segment_length, paragraphs))


 def clean_text(text):
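Note: `get_words` now has two cache layers — the JSON file on disk plus an in-memory `lru_cache(maxsize=16)` keyed on the full argument tuple. The reworked `extract_sponsors` groups consecutive words sharing a category and drops groups shorter than `min_sponsor_segment_length`. A quick illustration with hypothetical word dicts (real entries also carry timing keys not shown here):

    words = [
        {'text': 'hey', 'category': None},
        {'text': 'this', 'category': 'sponsor'},
        {'text': 'video', 'category': 'sponsor'},
        {'text': 'is', 'category': 'sponsor'},
        {'text': 'sponsored', 'category': 'sponsor'},
        {'text': 'bye', 'category': None},
    ]
    extract_sponsors(words)
    # -> [{'words': [...the four 'sponsor' words...], 'category': 'sponsor'}]
    # A run of only 2 sponsor words would be filtered out (default minimum is 3).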
@@ -231,33 +231,27 @@ def clean_text(text):
     return text.strip()


-def …(sponsor_segments):
-    """Choose the best sponsor segment if overlapping with others"""
-
+def remove_duplicate_segments(segments):
     # Algorithm based on SponsorBlock algorithm
+    # https://blog.ajay.app/voting-and-pseudo-randomness-or-sponsorblock-or-youtube-sponsorship-segment-blocker
     # Find sponsors that are overlapping
-    similar = []
-    for i in sponsor_segments:
-        for j in sponsor_segments:
-            # Since we do pairwise, we only check one direction
-            if (j['start'] >= i['start'] and j['start'] <= i['end']):
-                similar.append([i, j])
-
-    # Within each group, choose the segment with the most votes.
-    processed = []
-    best = []
-    for i in similar:
-        if i in processed:
-            continue
-        group = i
-        for j in similar:
-            if j[0] in group or j[1] in group:  # If either in, append both
-                group.append(j[0])
-                group.append(j[1])
-                processed.append(j)
-
-        …
+
+    best = []
+    for i in segments:
+        similar_segments = []
+        for j in segments:
+            if jaccard(i['start'], i['end'], j['start'], j['end']) > 0.1:  # Some overlap
+                similar_segments.append(j)
+
+        if similar_segments:
+            best_similar_seg = max(similar_segments, key=lambda item: (
+                item['locked'],
+                item['votes'],
+                item['views'],
+                item['reputation']
+            ))
+            if best_similar_seg not in best:
+                best.append(best_similar_seg)

     return best
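Note: `jaccard` comes from `utils` and is not part of this diff. Given how it is called above, it presumably computes the Jaccard index (intersection over union) of two time intervals; a minimal sketch under that assumption:

    def jaccard(x1, x2, y1, y2):
        # Overlap of the closed intervals [x1, x2] and [y1, y2], as a 0..1 ratio
        intersection = max(0, min(x2, y2) - max(x1, y1))
        union = (x2 - x1) + (y2 - y1) - intersection
        return intersection / union if union > 0 else 0

With this reading, two submissions covering 10-20 s and 18-30 s overlap by 2 s out of a 20 s union, giving 0.1 — just below the strict `> 0.1` cutoff used above.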
@@ -280,16 +274,25 @@ class PreprocessArguments:
     # Downvotes will make this negative.
     # 1 = At least one positive vote

+    min_views: int = field(
+        default=5, metadata={'help': 'Minimum number of views a segment must have to be considered. 0 = show all'})
+
     min_date: str = field(
-        …
+        # release of v2.0 (https://github.com/ajayyy/SponsorBlock/releases/tag/2.0)
+        default='08/06/2020',
+        # default='20/08/2021', # release of v3.0 (https://github.com/ajayyy/SponsorBlock/releases/tag/3.0)
+        # default='01/10/2020', # No more autovote
+        metadata={'help': 'Only use submissions from after this date'})

+    # TODO move?
     categories: str = field(
         default_factory=lambda: ['sponsor', 'selfpromo', 'interaction'],
         metadata={
             'nargs': '+',
-            'choices': ['intro', 'sponsor', 'interaction'…
-            …
+            'choices': ['intro', 'sponsor', 'interaction']
+            # 'outro', 'selfpromo', 'preview',
+            # 'poi_highlight', 'filler', 'music_offtopic',
+            # 'moreCategories'
         }
     )
@@ -345,7 +348,7 @@ class PreprocessArguments:
     )

     min_wps: float = field(
-        default=…
+        default=1.5, metadata={'help': 'Ignore videos with not enough words spoken per second. This is usually indicitive of video whose captions aren\'t English.'})
         # 0.1 ~ 1%
         # 0.4 ~ 2.5%
         # 0.9 ~ 5%
@@ -357,7 +360,7 @@ MIRRORS = [
     'https://sb-mirror.mchang.xyz/sponsorTimes.csv',  # 5 minute delay
     'https://sb.ltn.fi/database/sponsorTimes.csv',  # 5 minute delay
 ]
-# TODO only download latest
+# TODO only download latest updates/changes


 def download_file(url, filename):
@@ -480,7 +483,18 @@ def main():
     raw_dataset_path = os.path.join(
         preprocess_args.raw_data_dir, preprocess_args.raw_data_file)

-    …
+    if preprocess_args.update_database:
+        print('Updating database')
+        for mirror in MIRRORS:
+            print('Downloading from', mirror)
+            if download_file(mirror, raw_dataset_path):
+                break
+            print('Failed, trying next')
+
+    @lru_cache
+    def read_db():  # TODO save as file
+        print('Parsing raw database')
+        db = {}

         latest_time = datetime.strptime(preprocess_args.min_date, '%d/%m/%Y')
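Note: since `read_db` takes no arguments, `@lru_cache` here simply memoises its single result: the CSV is parsed once, and the later call in the same run returns the cached dict. A sketch of the effect:

    db1 = read_db()   # parses sponsorTimes.csv into {video_id: [segments]}
    db2 = read_db()   # cache hit: no re-parsing, db2 is db1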
@@ -488,10 +502,9 @@ def main():
         reader = csv.DictReader(csvfile)

         for line in reader:
-            submission_time = datetime.fromtimestamp(
-                float(line['timeSubmitted'])/1e3)
+            submission_time = float(line['timeSubmitted'])/1e3

-            if submission_time < latest_time:
+            if datetime.fromtimestamp(submission_time) < latest_time:
                 continue

             if line['service'] != 'YouTube':
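Note: `timeSubmitted` in the SponsorBlock dump is evidently a Unix timestamp in milliseconds, hence the division by 1e3 before `datetime.fromtimestamp`, which expects seconds. For example:

    submission_time = float('1591574400000') / 1e3   # -> 1591574400.0 seconds
    datetime.fromtimestamp(submission_time)          # 2020-06-08 00:00:00 on a UTC machine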
@@ -499,7 +512,6 @@ def main():
             if len(line['videoID']) != 11:
                 continue  # Invalid youtube video ID

-            # TODO add support for other categories and action types?
             if line['category'] not in preprocess_args.categories:
                 continue
             if line['actionType'] != 'skip':
@@ -511,53 +523,72 @@ def main():

             # Skip those that aren't highly voted
             line['votes'] = int(line['votes'])
-            # incorrect_votes = int(line['incorrectVotes'])
-
             if line['votes'] < preprocess_args.min_votes:
                 continue

-            …
+            locked = line['locked'] == '1'

-            …
+            # Skip segments with low views (i.e., not really reviewed)
+            # Always include segments locked by VIPs, regardless of view count
+            line['views'] = int(line['views'])
+            if not locked and line['views'] < preprocess_args.min_views:
+                continue
+
+            if line['videoID'] not in db:
+                db[line['videoID']] = []
+
+            db[line['videoID']].append({
+                'uuid': line['UUID'],
+                'start': float(line['startTime']),
+                'end': float(line['endTime']),
+                'votes': line['votes'],
+                'locked': locked,
+                'views': line['views'],
+                'submission_time': submission_time,
+                'reputation': line['reputation'],
+                'category': line['category'],
+                'action': line['actionType'],
+            })
+
+        num_segments = 0
+
+        # Remove duplicate sponsor segments by choosing best (most votes)
+        print('Remove duplicate segments')
+        for key in db:
+            db[key] = remove_duplicate_segments(db[key])
+            num_segments += len(db[key])
+        print('Saved', len(db), 'videos and', num_segments, 'segments')
+
+        return db

     # 'videoID', 'startTime', 'endTime', 'votes', 'locked', 'incorrectVotes', 'UUID',
     # 'userID', 'timeSubmitted', 'views', 'category', 'actionType', 'service', 'videoDuration',
     # 'hidden', 'reputation', 'shadowHidden', 'hashedVideoID', 'userAgent', 'description'
-    …
+    parsed_database = None
     if preprocess_args.do_transcribe:
         print('Collecting videos')
-        …
-        data_rows = get_rows()
-        for row in data_rows:
-            video_ids.add(row['videoID'])
+        parsed_database = read_db()

-        # …
-        …
+        # Remove transcripts already processed
+        finished = set(os.listdir('transcripts/auto/') +
+                       os.listdir('transcripts/manual/'))
+        finished = set([x.split('.')[0] for x in finished])

-        def on_job_complete(job):
-            progress.set_description(f'Processed {job.video_id}')
-            progress.update()
+        video_ids = list(parsed_database.keys() - finished)

-        …
-            preprocess_args.num_jobs, on_job_complete=on_job_complete)
+        # Create tasks generator
+        tasks = (
+            Task(get_words, video_id)
+            for video_id in video_ids
+        )

-        …
+        print('start')
+        with tqdm(total=len(video_ids)) as progress:
+            def callback(task):
+                progress.set_description(f'Processing {task.args[0]}')
+                progress.update()

-        …
+            InterruptibleTaskPool(tasks, preprocess_args.num_jobs, callback).start()

     final_path = os.path.join(
         processed_args.processed_dir, processed_args.processed_file)
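Note: `Task` and `InterruptibleTaskPool` are imported from `utils`, which this diff does not show. From their use above, a `Task` bundles a callable with its positional args (exposed as `.args`, which the callback reads), and the pool drains an iterable of tasks across `num_jobs` workers, invoking the callback after each task. A rough sketch of an interface consistent with that usage (names and details are assumptions, not the actual utils code):

    from concurrent.futures import ThreadPoolExecutor

    class Task:
        def __init__(self, function, *args, **kwargs):
            self.function = function
            self.args = args      # the callback above reads task.args[0] (the video_id)
            self.kwargs = kwargs

        def run(self):
            return self.function(*self.args, **self.kwargs)

    class InterruptibleTaskPool:
        def __init__(self, tasks, num_workers, callback=None):
            self.tasks = tasks            # any iterable, e.g. a generator of Tasks
            self.num_workers = num_workers
            self.callback = callback

        def start(self):
            def process(task):
                task.run()
                if self.callback:
                    self.callback(task)

            try:
                with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
                    list(executor.map(process, self.tasks))
            except KeyboardInterrupt:
                pass  # 'interruptible': Ctrl+C stops scheduling further work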
@@ -567,56 +598,42 @@ def main():

     final_data = {}

-    …
-    data_rows = get_rows()
-    # data_rows = itertools.islice(data_rows, 1000)  # TODO temp
+    parsed_database = read_db()

     # TODO add progress bar
     # TODO parallelise?
-    …
-    video_id …
+    with tqdm(total=len(parsed_database)) as progress:
+        for index, (video_id, segments) in enumerate(parsed_database.items()):
+
+            if preprocess_args.max_videos is not None and index >= preprocess_args.max_videos:
+                break
+            progress.set_description(f'Processing {video_id}')
+            progress.update()

-        if video_id not in final_data:
             final_data[video_id] = []

-        …
-
-        video_words = get_words(video_id, process=False)
-        if not video_words:
-            continue
-
-        segment_words = segment.extract_segment(
-            video_words, segment_start, segment_end)
-
-        if len(segment_words) <= 1:
-            continue  # Useless to add segment since no words
-
-        # duration = segment.word_end(segment_words[-1]) - segment.word_start(segment_words[0])
-        duration = segment_end - segment_start
-        wps = len(segment_words)/duration if duration > 0 else 0
-
-        if wps < preprocess_args.min_wps:
-            print(index, 'Skipping bad segment in',
-                  video_id, '| wps =', wps)
-            continue
-
-        final_data[video_id].append({
-            'start': segment_start,
-            'end': segment_end,
-            'votes': line['votes'],
-            'locked': line['locked'] == '1',
-            'views': line['views'],
-            'reputation': line['reputation'],
-            'category': line['category'],
-            'action': line['actionType'],
-            'uuid': line['UUID'],
-        })
+            video_words = get_words(video_id, process=False)
+            if not video_words:
+                continue

-        …
+            for seg in segments:  # Only add segments with high enough wps
+                segment_words = segment.extract_segment(
+                    video_words, seg['start'], seg['end'])
+
+                if len(segment_words) <= 1:
+                    continue  # Useless to add segment since no words
+
+                # duration = segment.word_end(segment_words[-1]) - segment.word_start(segment_words[0])
+                duration = seg['end'] - seg['start']
+                wps = len(segment_words)/duration if duration > 0 else 0
+
+                # print(video_id, wps)
+                if wps < preprocess_args.min_wps:
+                    # Skip sponsor segments without many words
+                    # e.g. music ads with some words on each side
+                    # progress.set_description(f'Skipping bad segment in {video_id} (wps={wps})')
+                    continue
+                final_data[video_id].append(seg)

     # Save data
     with open(final_path, 'w') as fp:
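Note on the `wps` (words per second) filter: a 30-second sponsor segment whose transcript yields only 20 words has wps = 20/30 ≈ 0.67, below the new default of 1.5, so it is dropped. Typical spoken English runs around 2-3 words per second, so segments far below that are usually music, silence, or non-English captions.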
@@ -656,8 +673,9 @@ def main():

     tokenizer = get_tokenizer(model_args)

-    count_videos = 0
-    count_segments = 0
+    # TODO
+    # count_videos = 0
+    # count_segments = 0

     write_mode = 'w' if preprocess_args.overwrite else 'a'
@@ -682,15 +700,15 @@ def main():
         open(negative_file, write_mode, encoding='utf-8') as negative, \
             tqdm(total=total) as progress:

-        for video_id, sponsor_segments in data:
+        for ind, (video_id, sponsor_segments) in enumerate(data):
             index += 1  # TODO FIX index + incrementing
-            progress.set_description(f'Processing {video_id}')

-            if …
-                progress.update()
-            elif count_videos >= preprocess_args.max_videos:
+            if preprocess_args.max_videos is not None and ind >= preprocess_args.max_videos:
                 break

+            progress.set_description(f'Processing {video_id}')
+            progress.update()
+
             words = get_words(video_id, process=False)
             if not words:
                 continue
@@ -707,16 +725,13 @@ def main():
             if not segments:
                 continue

-            count_videos += 1
-            if not get_all:
-                progress.update()
-
             for seg in segments:
                 duration = segment.word_end(
                     seg[-1]) - segment.word_start(seg[0])
                 wps = len(seg)/duration if duration > 0 else 0

                 # Ignore segments with "not enough words" in the transcript
+                # Must do here since this includes non-sponsor segments
                 if wps < preprocess_args.min_wps:
                     continue
@@ -732,13 +747,13 @@ def main():
             if extracted_segments:
                 extracted_texts = []
                 for s in extracted_segments:
-                    w = ' '.join(s['words'])
+                    w = ' '.join([q['text'] for q in s['words']])
                     category = s['category'].upper()
+                    extracted_texts.append(
+                        f"{START_SEGMENT_TEMPLATE.format(category)} {w} {END_SEGMENT_TEMPLATE.format(category)}")

-                    …
-                    extracted_texts…
-
-                extracted_text = '\n'.join(extracted_texts)
+                extracted_text = f' {CustomTokens.BETWEEN_SEGMENTS.value} '.join(
+                    extracted_texts)

                 d['extracted'] = clean_text(extracted_text)
                 print(json.dumps(d), file=positive)
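Note: `START_SEGMENT_TEMPLATE` and `END_SEGMENT_TEMPLATE` (imported from `shared` at the top of this diff) and `CustomTokens.BETWEEN_SEGMENTS` are defined outside this file. Judging from the f-strings above, the templates are format strings taking the upper-cased category; hypothetical values, for illustration only:

    START_SEGMENT_TEMPLATE = 'START_{}_TOKEN'
    END_SEGMENT_TEMPLATE = 'END_{}_TOKEN'

    # An extracted sponsor paragraph would then serialise roughly as:
    #   START_SPONSOR_TOKEN this video is sponsored by ... END_SPONSOR_TOKEN
    # with CustomTokens.BETWEEN_SEGMENTS.value joining multiple paragraphs.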