Kathir0011 committed on
Commit
541a629
·
verified ·
1 Parent(s): f647d99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -14
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
- import os, re
 
3
 
4
  from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
5
  from youtube_transcript_api import YouTubeTranscriptApi
@@ -13,11 +14,24 @@ from langchain.prompts.chat import (
13
  )
14
 
15
 
16
def get_video_id(youtube_url):
    """Return the 11-character YouTube video ID found in *youtube_url*.

    The ID is taken from either a ``v=<id>`` query parameter or a
    ``/<id>`` path segment. Returns ``None`` when no ID is present.
    """
    found = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", youtube_url)
    if found is None:
        return None
    return found.group(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
 
23
  def create_db_from_video_url(video_url, api_key):
@@ -25,17 +39,13 @@ def create_db_from_video_url(video_url, api_key):
25
  Creates an Embedding of the Video and performs
26
  """
27
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=api_key)
28
-
29
- video_id = get_video_id(video_url)
30
- if not video_id:
31
- return "Invalid YouTube URL!"
32
- transcript = YouTubeTranscriptApi.get_transcript(video_id)
33
- text = "\n".join([t["text"] for t in transcript])
34
- print(text)
35
  # cannot provide this directly to the model so we are splitting the transcripts into small chunks
36
 
37
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
38
- docs = text_splitter.split_documents(text)
39
  print(docs)
40
 
41
  db = FAISS.from_documents(docs, embedding=embeddings)
 
1
  import gradio as gr
2
+ import os
3
+ import yt_dlp
4
 
5
  from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
6
  from youtube_transcript_api import YouTubeTranscriptApi
 
14
  )
15
 
16
 
17
+ def get_transcript_yt_dlp(video_url):
18
+ """Fetches transcript using yt_dlp."""
19
+ ydl_opts = {
20
+ "writesubtitles": True,
21
+ "writeautomaticsub": True,
22
+ "skip_download": True,
23
+ "subtitleslangs": ["en"], # Fetch English subtitles
24
+ }
25
+
26
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
27
+ info_dict = ydl.extract_info(video_url, download=False)
28
+ subtitles = info_dict.get("subtitles") or info_dict.get("automatic_captions")
29
+
30
+ if subtitles and "en" in subtitles:
31
+ sub_url = subtitles["en"][0]["url"]
32
+ return f"Transcript URL: {sub_url}"
33
+ else:
34
+ return "No subtitles available!"
35
 
36
 
37
  def create_db_from_video_url(video_url, api_key):
 
39
  Creates an Embedding of the Video and performs
40
  """
41
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=api_key)
42
+
43
+ transcripts = get_transcript_yt_dlp(video_url)
44
+ print(transcripts)
 
 
 
 
45
  # cannot provide this directly to the model so we are splitting the transcripts into small chunks
46
 
47
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
48
+ docs = text_splitter.split_documents(transcripts)
49
  print(docs)
50
 
51
  db = FAISS.from_documents(docs, embedding=embeddings)