Spaces:
Paused
Paused
Update webscout.py
Browse files- webscout.py +189 -139
webscout.py
CHANGED
|
@@ -1176,145 +1176,229 @@ class WEBS:
|
|
| 1176 |
return results
|
| 1177 |
import requests
|
| 1178 |
import http.cookiejar as cookiejar
|
| 1179 |
-
import sys
|
| 1180 |
import json
|
| 1181 |
from xml.etree import ElementTree
|
| 1182 |
import re
|
| 1183 |
-
from requests import HTTPError
|
| 1184 |
import html.parser
|
|
|
|
| 1185 |
|
| 1186 |
html_parser = html.parser.HTMLParser()
|
| 1187 |
-
|
| 1188 |
|
| 1189 |
def unescape(string):
|
| 1190 |
return html.unescape(string)
|
| 1191 |
-
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
| 1192 |
|
| 1193 |
-
class TranscriptRetrievalError(Exception):
|
| 1194 |
-
"""
|
| 1195 |
-
Base class for exceptions raised when a transcript cannot be retrieved.
|
| 1196 |
-
"""
|
| 1197 |
-
ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
|
| 1198 |
-
CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
|
| 1199 |
-
CAUSE_MESSAGE = ''
|
| 1200 |
-
GITHUB_REFERRAL = (
|
| 1201 |
-
'\n\nIf you are sure that the described cause is not responsible for this error '
|
| 1202 |
-
'and that a transcript should be retrievable, please create an issue at '
|
| 1203 |
-
'https://github.com/OE-LUCIFER/Webscout/issues. '
|
| 1204 |
-
'Please add which version of webscout you are using '
|
| 1205 |
-
'and provide the information needed to replicate the error. '
|
| 1206 |
-
)
|
| 1207 |
|
| 1208 |
-
|
| 1209 |
-
self.video_id = video_id
|
| 1210 |
-
super(TranscriptRetrievalError, self).__init__(self._build_error_message())
|
| 1211 |
|
| 1212 |
-
def _build_error_message(self):
|
| 1213 |
-
cause = self.cause
|
| 1214 |
-
error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))
|
| 1215 |
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
|
| 1219 |
-
|
|
|
|
|
|
|
| 1220 |
|
| 1221 |
-
@property
|
| 1222 |
-
def cause(self):
|
| 1223 |
-
return self.CAUSE_MESSAGE
|
| 1224 |
|
| 1225 |
class YouTubeRequestFailedError(TranscriptRetrievalError):
|
| 1226 |
-
|
| 1227 |
|
| 1228 |
def __init__(self, video_id, http_error):
|
| 1229 |
-
|
| 1230 |
-
super(
|
| 1231 |
|
| 1232 |
-
@property
|
| 1233 |
-
def cause(self):
|
| 1234 |
-
return self.CAUSE_MESSAGE.format(reason=self.reason)
|
| 1235 |
|
| 1236 |
class VideoUnavailableError(TranscriptRetrievalError):
|
| 1237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1238 |
|
| 1239 |
class InvalidVideoIdError(TranscriptRetrievalError):
|
| 1240 |
-
|
| 1241 |
-
|
| 1242 |
-
|
| 1243 |
-
|
| 1244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1245 |
|
| 1246 |
class TooManyRequestsError(TranscriptRetrievalError):
|
| 1247 |
-
|
| 1248 |
-
|
| 1249 |
-
|
| 1250 |
-
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
-
|
|
|
|
|
|
|
|
|
|
| 1256 |
|
| 1257 |
class TranscriptsDisabledError(TranscriptRetrievalError):
|
| 1258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1259 |
|
| 1260 |
class NoTranscriptAvailableError(TranscriptRetrievalError):
|
| 1261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1262 |
|
| 1263 |
class NotTranslatableError(TranscriptRetrievalError):
|
| 1264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1265 |
|
| 1266 |
class TranslationLanguageNotAvailableError(TranscriptRetrievalError):
|
| 1267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1268 |
|
| 1269 |
class CookiePathInvalidError(TranscriptRetrievalError):
|
| 1270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1271 |
|
| 1272 |
class CookiesInvalidError(TranscriptRetrievalError):
|
| 1273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1274 |
|
| 1275 |
class FailedToCreateConsentCookieError(TranscriptRetrievalError):
|
| 1276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1277 |
|
| 1278 |
class NoTranscriptFoundError(TranscriptRetrievalError):
|
| 1279 |
-
|
| 1280 |
-
'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
|
| 1281 |
-
'{transcript_data}'
|
| 1282 |
-
)
|
| 1283 |
|
| 1284 |
def __init__(self, video_id, requested_language_codes, transcript_data):
|
| 1285 |
-
|
| 1286 |
-
|
| 1287 |
-
|
| 1288 |
-
|
| 1289 |
-
@property
|
| 1290 |
-
def cause(self):
|
| 1291 |
-
return self.CAUSE_MESSAGE.format(
|
| 1292 |
-
requested_language_codes=self._requested_language_codes,
|
| 1293 |
-
transcript_data=str(self._transcript_data),
|
| 1294 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1295 |
|
| 1296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1297 |
|
| 1298 |
-
|
| 1299 |
-
|
| 1300 |
-
|
| 1301 |
-
|
| 1302 |
-
|
| 1303 |
-
|
|
|
|
| 1304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1305 |
|
| 1306 |
-
|
| 1307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1308 |
self._http_client = http_client
|
| 1309 |
|
| 1310 |
-
def fetch(self, video_id):
|
|
|
|
| 1311 |
return TranscriptList.build(
|
| 1312 |
self._http_client,
|
| 1313 |
video_id,
|
| 1314 |
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
|
| 1315 |
)
|
| 1316 |
|
| 1317 |
-
def _extract_captions_json(self, html, video_id):
|
|
|
|
| 1318 |
splitted_html = html.split('"captions":')
|
| 1319 |
|
| 1320 |
if len(splitted_html) <= 1:
|
|
@@ -1358,11 +1442,8 @@ class TranscriptListFetcher(object):
|
|
| 1358 |
return unescape(_raise_http_errors(response, video_id).text)
|
| 1359 |
|
| 1360 |
|
| 1361 |
-
class TranscriptList
|
| 1362 |
-
"""
|
| 1363 |
-
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
|
| 1364 |
-
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
|
| 1365 |
-
"""
|
| 1366 |
|
| 1367 |
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
|
| 1368 |
"""
|
|
@@ -1434,18 +1515,18 @@ class TranscriptList(object):
|
|
| 1434 |
|
| 1435 |
def find_transcript(self, language_codes):
|
| 1436 |
"""
|
| 1437 |
-
Finds a transcript for a given language code.
|
| 1438 |
-
|
| 1439 |
-
`find_manually_created_transcript` instead.
|
| 1440 |
|
| 1441 |
-
:param language_codes: A list of language codes in a descending priority.
|
| 1442 |
-
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
| 1443 |
-
it fails to do so.
|
| 1444 |
:type languages: list[str]
|
| 1445 |
:return: the found Transcript
|
| 1446 |
:rtype Transcript:
|
| 1447 |
:raises: NoTranscriptFound
|
| 1448 |
"""
|
|
|
|
|
|
|
|
|
|
| 1449 |
return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
|
| 1450 |
|
| 1451 |
def find_generated_transcript(self, language_codes):
|
|
@@ -1460,6 +1541,10 @@ class TranscriptList(object):
|
|
| 1460 |
:rtype Transcript:
|
| 1461 |
:raises: NoTranscriptFound
|
| 1462 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1463 |
return self._find_transcript(language_codes, [self._generated_transcripts])
|
| 1464 |
|
| 1465 |
def find_manually_created_transcript(self, language_codes):
|
|
@@ -1518,7 +1603,9 @@ class TranscriptList(object):
|
|
| 1518 |
return description if description else 'None'
|
| 1519 |
|
| 1520 |
|
| 1521 |
-
class Transcript
|
|
|
|
|
|
|
| 1522 |
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
|
| 1523 |
"""
|
| 1524 |
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
|
@@ -1555,7 +1642,7 @@ class Transcript(object):
|
|
| 1555 |
:rtype [{'text': str, 'start': float, 'end': float}]:
|
| 1556 |
"""
|
| 1557 |
response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
|
| 1558 |
-
return
|
| 1559 |
_raise_http_errors(response, self.video_id).text,
|
| 1560 |
)
|
| 1561 |
|
|
@@ -1588,7 +1675,8 @@ class Transcript(object):
|
|
| 1588 |
)
|
| 1589 |
|
| 1590 |
|
| 1591 |
-
class
|
|
|
|
| 1592 |
_FORMATTING_TAGS = [
|
| 1593 |
'strong', # important
|
| 1594 |
'em', # emphasized
|
|
@@ -1625,52 +1713,14 @@ class _TranscriptParser(object):
|
|
| 1625 |
if xml_element.text is not None
|
| 1626 |
]
|
| 1627 |
|
| 1628 |
-
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
| 1629 |
-
|
| 1630 |
-
class transcriber(object):
|
| 1631 |
-
@classmethod
|
| 1632 |
-
def list_transcripts(cls, video_id, proxies=None, cookies=None):
|
| 1633 |
-
with requests.Session() as http_client:
|
| 1634 |
-
if cookies:
|
| 1635 |
-
http_client.cookies = cls._load_cookies(cookies, video_id)
|
| 1636 |
-
http_client.proxies = proxies if proxies else {}
|
| 1637 |
-
return TranscriptListFetcher(http_client).fetch(video_id)
|
| 1638 |
-
|
| 1639 |
-
@classmethod
|
| 1640 |
-
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
|
| 1641 |
-
cookies=None, preserve_formatting=False):
|
| 1642 |
-
|
| 1643 |
-
assert isinstance(video_ids, list), "`video_ids` must be a list of strings"
|
| 1644 |
-
|
| 1645 |
-
data = {}
|
| 1646 |
-
unretrievable_videos = []
|
| 1647 |
-
|
| 1648 |
-
for video_id in video_ids:
|
| 1649 |
-
try:
|
| 1650 |
-
data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
|
| 1651 |
-
except Exception as exception:
|
| 1652 |
-
if not continue_after_error:
|
| 1653 |
-
raise exception
|
| 1654 |
-
|
| 1655 |
-
unretrievable_videos.append(video_id)
|
| 1656 |
|
| 1657 |
-
|
| 1658 |
-
|
| 1659 |
-
|
| 1660 |
-
|
| 1661 |
-
|
| 1662 |
-
|
| 1663 |
|
| 1664 |
-
@classmethod
|
| 1665 |
-
def _load_cookies(cls, cookies, video_id):
|
| 1666 |
-
try:
|
| 1667 |
-
cookie_jar = cookiejar.MozillaCookieJar()
|
| 1668 |
-
cookie_jar.load(cookies)
|
| 1669 |
-
if not cookie_jar:
|
| 1670 |
-
raise CookiesInvalidError(video_id)
|
| 1671 |
-
return cookie_jar
|
| 1672 |
-
except:
|
| 1673 |
-
raise CookiePathInvalidError(video_id)
|
| 1674 |
|
| 1675 |
class LLM:
|
| 1676 |
def __init__(self, model: str, system_message: str = "You are a Helpful AI."):
|
|
|
|
| 1176 |
return results
|
| 1177 |
import requests
|
| 1178 |
import http.cookiejar as cookiejar
|
|
|
|
| 1179 |
import json
|
| 1180 |
from xml.etree import ElementTree
|
| 1181 |
import re
|
|
|
|
| 1182 |
import html.parser
|
| 1183 |
+
from typing import List, Dict, Union, Optional
|
| 1184 |
|
| 1185 |
html_parser = html.parser.HTMLParser()
|
| 1186 |
+
|
| 1187 |
|
| 1188 |
def unescape(string):
    """Return *string* with its HTML character references decoded.

    Thin wrapper around the standard library's ``html.unescape``.
    """
    decoded = html.unescape(string)
    return decoded
|
|
|
|
| 1190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1191 |
|
| 1192 |
+
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'


class TranscriptRetrievalError(Exception):
    """Base class for transcript retrieval errors.

    Args:
        video_id: Identifier of the video whose transcript was requested.
            It is kept on the instance as ``video_id``.
        message: Human-readable description of the failure. Every literal
            ``{video_url}`` marker in it is replaced by the watch URL built
            from ``video_id``; all other text is used verbatim.
    """

    def __init__(self, video_id, message):
        self.video_id = video_id
        # Substitute with str.replace instead of str.format: subclasses hand
        # over text that is already rendered (HTTP error strings, transcript
        # listings), and a stray '{' or '}' in it would make str.format raise
        # KeyError/IndexError/ValueError while the real error is being built.
        video_url = WATCH_URL.format(video_id=video_id)
        super().__init__(message.replace('{video_url}', video_url))
|
| 1201 |
|
|
|
|
|
|
|
|
|
|
| 1202 |
|
| 1203 |
class YouTubeRequestFailedError(TranscriptRetrievalError):
    """Signals that an HTTP request to YouTube did not succeed.

    Args:
        video_id: Identifier of the video the request was made for.
        http_error: The underlying error; its ``str()`` form is embedded in
            the exception message.
    """

    def __init__(self, video_id, http_error):
        reason = str(http_error)
        super().__init__(video_id, 'Request to YouTube failed: ' + reason)
|
| 1209 |
|
|
|
|
|
|
|
|
|
|
| 1210 |
|
| 1211 |
class VideoUnavailableError(TranscriptRetrievalError):
    """Signals that the requested video is no longer available.

    Args:
        video_id: Identifier of the affected video.
    """

    def __init__(self, video_id):
        # The base class receives the fixed description together with the id.
        super().__init__(video_id, 'The video is no longer available')
|
| 1217 |
+
|
| 1218 |
|
| 1219 |
class InvalidVideoIdError(TranscriptRetrievalError):
    """Signals that the value supplied as video identifier is not usable.

    Args:
        video_id: The offending value as given by the caller.
    """

    def __init__(self, video_id):
        # NOTE(review): this text tells callers to pass a bare id, while
        # YTTranscriber._extract_video_id in this file raises this error for
        # any input that is not a recognised URL -- confirm which is intended.
        parts = (
            'You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n',
            'Do NOT run: `YTTranscriber.get_transcript("https://www.youtube.com/watch?v=1234")`\n',
            'Instead run: `YTTranscriber.get_transcript("1234")`',
        )
        super().__init__(video_id, ''.join(parts))
|
| 1229 |
+
|
| 1230 |
|
| 1231 |
class TooManyRequestsError(TranscriptRetrievalError):
    """Raised when YouTube rate limits the requests.

    Args:
        video_id: Identifier of the video that was being requested.
    """

    def __init__(self, video_id):
        # Each bullet is its own literal ending in an explicit '\n'. The
        # previous backslash continuations inside the literals made the text
        # depend on source indentation and left the first two bullets on the
        # same line.
        message = (
            'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
            'One of the following things can be done to work around this:\n'
            '- Manually solve the captcha in a browser and export the cookie.\n'
            '- Use a different IP address\n'
            '- Wait until the ban on your IP has been lifted'
        )
        super().__init__(video_id, message)
|
| 1243 |
+
|
| 1244 |
|
| 1245 |
class TranscriptsDisabledError(TranscriptRetrievalError):
    """Signals that subtitles are switched off for the video.

    Args:
        video_id: Identifier of the affected video.
    """

    def __init__(self, video_id):
        super().__init__(video_id, 'Subtitles are disabled for this video')
|
| 1251 |
+
|
| 1252 |
|
| 1253 |
class NoTranscriptAvailableError(TranscriptRetrievalError):
    """Signals that the video has no transcripts at all.

    Args:
        video_id: Identifier of the affected video.
    """

    def __init__(self, video_id):
        super().__init__(video_id, 'No transcripts are available for this video')
|
| 1259 |
+
|
| 1260 |
|
| 1261 |
class NotTranslatableError(TranscriptRetrievalError):
    """Signals that the requested language cannot be translated.

    Args:
        video_id: Identifier of the affected video.
    """

    def __init__(self, video_id):
        super().__init__(video_id, 'The requested language is not translatable')
|
| 1267 |
+
|
| 1268 |
|
| 1269 |
class TranslationLanguageNotAvailableError(TranscriptRetrievalError):
    """Signals that the requested translation language is not offered.

    Args:
        video_id: Identifier of the affected video.
    """

    def __init__(self, video_id):
        super().__init__(video_id, 'The requested translation language is not available')
|
| 1275 |
+
|
| 1276 |
|
| 1277 |
class CookiePathInvalidError(TranscriptRetrievalError):
    """Signals that the supplied cookie file could not be loaded.

    Args:
        video_id: Identifier of the video that was being requested.
    """

    def __init__(self, video_id):
        super().__init__(video_id, 'The provided cookie file was unable to be loaded')
|
| 1283 |
+
|
| 1284 |
|
| 1285 |
class CookiesInvalidError(TranscriptRetrievalError):
    """Signals that the supplied cookies are not valid.

    Args:
        video_id: Identifier of the video that was being requested.
    """

    def __init__(self, video_id):
        super().__init__(video_id, 'The cookies provided are not valid (may have expired)')
|
| 1291 |
+
|
| 1292 |
|
| 1293 |
class FailedToCreateConsentCookieError(TranscriptRetrievalError):
    """Signals that the consent cookie could not be created automatically.

    Args:
        video_id: Identifier of the video that was being requested.
    """

    def __init__(self, video_id):
        super().__init__(video_id, 'Failed to automatically give consent to saving cookies')
|
| 1299 |
+
|
| 1300 |
|
| 1301 |
class NoTranscriptFoundError(TranscriptRetrievalError):
    """Signals that none of the requested language codes has a transcript.

    Args:
        video_id: Identifier of the video that was searched.
        requested_language_codes: The language codes that were looked for;
            rendered into the message.
        transcript_data: Object whose ``str()`` form is appended to the
            message after a blank line.
    """

    def __init__(self, video_id, requested_language_codes, transcript_data):
        template = (
            'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
            '{transcript_data}'
        )
        details = {
            'requested_language_codes': requested_language_codes,
            'transcript_data': str(transcript_data),
        }
        super().__init__(video_id, template.format(**details))
|
| 1313 |
|
| 1314 |
|
| 1315 |
+
class YTTranscriber:
    """
    Main class for retrieving YouTube transcripts.

    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def get_transcript(video_url: str, languages: Optional[str] = 'en',
                       proxies: Optional[Dict[str, str]] = None,
                       cookies: Optional[str] = None,
                       preserve_formatting: bool = False) -> List[Dict[str, Union[str, float]]]:
        """
        Retrieves the transcript for a given YouTube video URL.

        Args:
            video_url (str): YouTube video URL in one of the forms accepted by
                ``_extract_video_id`` (``youtube.com/watch?v=...`` or ``youtu.be/...``).
            languages (str, optional): Language code for the transcript.
                If None, the first auto-generated transcript is fetched
                regardless of its language. Defaults to 'en'.
            proxies (Dict[str, str], optional): Proxies assigned to the HTTP session.
                Defaults to None, which results in an empty proxy mapping.
            cookies (str, optional): Path to the cookie file. Defaults to None.
            preserve_formatting (bool, optional): Forwarded to ``Transcript.fetch``.
                Defaults to False.

        Returns:
            List[Dict[str, Union[str, float]]]: Whatever ``Transcript.fetch`` returns.
            NOTE(review): presumably one dict per segment with 'text', 'start' and
            'duration' keys; confirm against the transcript parser.

        Raises:
            InvalidVideoIdError: If no video id can be extracted from ``video_url``.
            CookiePathInvalidError: If the cookie file cannot be loaded.
            CookiesInvalidError: If the cookie file loads but contains no cookies.
            TranscriptRetrievalError: For failures raised while listing or fetching
                the transcript.
        """
        video_id = YTTranscriber._extract_video_id(video_url)

        with requests.Session() as http_client:
            if cookies:
                http_client.cookies = YTTranscriber._load_cookies(cookies, video_id)
            http_client.proxies = proxies if proxies else {}
            transcript_list = TranscriptListFetcher(http_client).fetch(video_id)

            if languages is None:
                # 'any' makes the lookup return the first generated transcript.
                transcript = transcript_list.find_generated_transcript(['any'])
            else:
                transcript = transcript_list.find_transcript([languages])
            # Fetch inside the ``with`` block: the transcript uses this session.
            return transcript.fetch(preserve_formatting=preserve_formatting)

    @staticmethod
    def _extract_video_id(video_url: str) -> str:
        """Extracts the video ID from a ``watch?v=`` or ``youtu.be/`` URL.

        Raises:
            InvalidVideoIdError: If the URL matches neither form or the
                extracted id is empty.
        """
        if 'youtube.com/watch?v=' in video_url:
            candidate = video_url.split('youtube.com/watch?v=')[1].split('&')[0]
        elif 'youtu.be/' in video_url:
            candidate = video_url.split('youtu.be/')[1].split('?')[0]
        else:
            raise InvalidVideoIdError(video_url)

        # A URL fragment ("#t=30") is never part of the id.
        video_id = candidate.split('#')[0]
        if not video_id:
            # e.g. "https://youtu.be/": nothing usable follows the marker.
            raise InvalidVideoIdError(video_url)
        return video_id

    @staticmethod
    def _load_cookies(cookies: str, video_id: str) -> cookiejar.MozillaCookieJar:
        """Loads cookies from a Mozilla/Netscape format cookie file.

        Raises:
            CookiePathInvalidError: If the file cannot be opened or parsed.
            CookiesInvalidError: If the file loads but yields no cookies.
        """
        cookie_jar = cookiejar.MozillaCookieJar()
        try:
            cookie_jar.load(cookies)
        except (cookiejar.LoadError, OSError, TypeError, ValueError) as error:
            raise CookiePathInvalidError(video_id) from error
        # Checked outside the ``try`` so this error is not re-labelled as a
        # path problem, which the previous bare ``except:`` did.
        if not cookie_jar:
            raise CookiesInvalidError(video_id)
        return cookie_jar
|
| 1383 |
+
|
| 1384 |
+
|
| 1385 |
+
class TranscriptListFetcher:
|
| 1386 |
+
"""Fetches the list of transcripts for a YouTube video."""
|
| 1387 |
+
|
| 1388 |
+
def __init__(self, http_client: requests.Session):
    """Stores the given HTTP session on the instance for later use.

    :param http_client: session object kept as ``self._http_client``
    """
    self._http_client = http_client
|
| 1391 |
|
| 1392 |
+
def fetch(self, video_id: str):
    """Builds the TranscriptList for ``video_id``.

    The HTML returned by ``_fetch_video_html`` is passed through
    ``_extract_captions_json``; the result is handed to
    ``TranscriptList.build`` together with the stored HTTP client.
    """
    page_html = self._fetch_video_html(video_id)
    captions_json = self._extract_captions_json(page_html, video_id)
    return TranscriptList.build(self._http_client, video_id, captions_json)
|
| 1399 |
|
| 1400 |
+
def _extract_captions_json(self, html: str, video_id: str) -> dict:
|
| 1401 |
+
"""Extracts the captions JSON data from the video's HTML."""
|
| 1402 |
splitted_html = html.split('"captions":')
|
| 1403 |
|
| 1404 |
if len(splitted_html) <= 1:
|
|
|
|
| 1442 |
return unescape(_raise_http_errors(response, video_id).text)
|
| 1443 |
|
| 1444 |
|
| 1445 |
+
class TranscriptList:
|
| 1446 |
+
"""Represents a list of available transcripts."""
|
|
|
|
|
|
|
|
|
|
| 1447 |
|
| 1448 |
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
|
| 1449 |
"""
|
|
|
|
| 1515 |
|
| 1516 |
def find_transcript(self, language_codes):
    """
    Looks up a transcript among the manually created and the generated ones.

    When the string 'any' is among the language codes, the first transcript obtained by
    iterating over this object is returned without looking at its language. Otherwise, or
    when iteration yields nothing, the lookup is delegated to `_find_transcript`, which is
    given the manually created transcripts first and the generated ones second.

    :param language_codes: language codes to look for; may contain the special value 'any'
    :type language_codes: list[str]
    :return: the found Transcript
    :rtype Transcript:
    :raises: presumably NoTranscriptFoundError from `_find_transcript` (not visible here)
    """
    if 'any' in language_codes:
        nothing = object()  # sentinel: distinguishes "no items" from any real item
        first_available = next(iter(self), nothing)
        if first_available is not nothing:
            return first_available
    candidates = [self._manually_created_transcripts, self._generated_transcripts]
    return self._find_transcript(language_codes, candidates)
|
| 1531 |
|
| 1532 |
def find_generated_transcript(self, language_codes):
|
|
|
|
| 1541 |
:rtype Transcript:
|
| 1542 |
:raises: NoTranscriptFound
|
| 1543 |
"""
|
| 1544 |
+
if 'any' in language_codes:
|
| 1545 |
+
for transcript in self:
|
| 1546 |
+
if transcript.is_generated:
|
| 1547 |
+
return transcript
|
| 1548 |
return self._find_transcript(language_codes, [self._generated_transcripts])
|
| 1549 |
|
| 1550 |
def find_manually_created_transcript(self, language_codes):
|
|
|
|
| 1603 |
return description if description else 'None'
|
| 1604 |
|
| 1605 |
|
| 1606 |
+
class Transcript:
|
| 1607 |
+
"""Represents a single transcript."""
|
| 1608 |
+
|
| 1609 |
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
|
| 1610 |
"""
|
| 1611 |
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
|
|
|
| 1642 |
:rtype [{'text': str, 'start': float, 'end': float}]:
|
| 1643 |
"""
|
| 1644 |
response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
|
| 1645 |
+
return TranscriptParser(preserve_formatting=preserve_formatting).parse(
|
| 1646 |
_raise_http_errors(response, self.video_id).text,
|
| 1647 |
)
|
| 1648 |
|
|
|
|
| 1675 |
)
|
| 1676 |
|
| 1677 |
|
| 1678 |
+
class TranscriptParser:
|
| 1679 |
+
"""Parses the transcript data from XML."""
|
| 1680 |
_FORMATTING_TAGS = [
|
| 1681 |
'strong', # important
|
| 1682 |
'em', # emphasized
|
|
|
|
| 1713 |
if xml_element.text is not None
|
| 1714 |
]
|
| 1715 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1716 |
|
| 1717 |
+
def _raise_http_errors(response, video_id):
    """Returns ``response`` unchanged unless its status check fails.

    :param response: object exposing ``raise_for_status()``
    :param video_id: id passed on to the raised error
    :return: the same ``response`` object
    :raises: YouTubeRequestFailedError when ``raise_for_status()`` raises
        ``requests.exceptions.HTTPError``
    """
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as error:
        raise YouTubeRequestFailedError(video_id, error)
    else:
        return response
|
| 1723 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1724 |
|
| 1725 |
class LLM:
|
| 1726 |
def __init__(self, model: str, system_message: str = "You are a Helpful AI."):
|