Update app.py
app.py CHANGED

@@ -1,5 +1,6 @@
 import gradio as gr
 import os
+import yt_dlp
 
 from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
 from youtube_transcript_api import YouTubeTranscriptApi
@@ -13,11 +14,24 @@ from langchain.prompts.chat import (
 )
 
 
-def
-"""
-
-
-
+def get_transcript_yt_dlp(video_url):
+    """Fetches transcript using yt_dlp."""
+    ydl_opts = {
+        "writesubtitles": True,
+        "writeautomaticsub": True,
+        "skip_download": True,
+        "subtitleslangs": ["en"],  # Fetch English subtitles
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info_dict = ydl.extract_info(video_url, download=False)
+        subtitles = info_dict.get("subtitles") or info_dict.get("automatic_captions")
+
+    if subtitles and "en" in subtitles:
+        sub_url = subtitles["en"][0]["url"]
+        return f"Transcript URL: {sub_url}"
+    else:
+        return "No subtitles available!"
 
 
 def create_db_from_video_url(video_url, api_key):
@@ -25,17 +39,13 @@ def create_db_from_video_url(video_url, api_key):
     Creates an Embedding of the Video and performs
     """
     embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=api_key)
-
-
-
-        return "Invalid YouTube URL!"
-    transcript = YouTubeTranscriptApi.get_transcript(video_id)
-    text = "\n".join([t["text"] for t in transcript])
-    print(text)
+
+    transcripts = get_transcript_yt_dlp(video_url)
+    print(transcripts)
     # cannot provide this directly to the model so we are splitting the transcripts into small chunks
 
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-    docs = text_splitter.split_documents(
+    docs = text_splitter.split_documents(transcripts)
     print(docs)
 
     db = FAISS.from_documents(docs, embedding=embeddings)
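One caveat worth noting about this change: `get_transcript_yt_dlp` returns a plain string (either `Transcript URL: ...` or `No subtitles available!`), while `RecursiveCharacterTextSplitter.split_documents` expects a list of LangChain `Document` objects, so `create_db_from_video_url` would fail at runtime when it passes that string through. Below is a minimal sketch of one way to bridge the gap; it is not part of this commit. It assumes `requests` is installed, assumes the English track is available as WebVTT, and uses a hypothetical helper name `get_transcript_documents` that downloads the subtitle URL and wraps the cleaned text in a `Document` list.

import re

import requests
import yt_dlp
from langchain.docstore.document import Document


def get_transcript_documents(video_url):
    """Hypothetical replacement: returns English subtitles as LangChain Documents."""
    ydl_opts = {
        "writesubtitles": True,
        "writeautomaticsub": True,
        "skip_download": True,
        "subtitleslangs": ["en"],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=False)

    subtitles = info_dict.get("subtitles") or info_dict.get("automatic_captions") or {}
    if "en" not in subtitles:
        raise ValueError("No subtitles available!")

    # Each language entry is a list of format dicts with "ext" and "url" keys;
    # prefer a WebVTT track so the cleanup below applies, else take the first one.
    tracks = subtitles["en"]
    track = next((t for t in tracks if t.get("ext") == "vtt"), tracks[0])
    raw = requests.get(track["url"], timeout=30).text

    # Crude WebVTT cleanup: drop header lines, cue timings, blanks, and inline tags.
    lines = []
    for line in raw.splitlines():
        if line.startswith(("WEBVTT", "Kind:", "Language:")) or "-->" in line or not line.strip():
            continue
        lines.append(re.sub(r"<[^>]+>", "", line).strip())

    # split_documents() downstream expects Documents, so wrap the text once here.
    return [Document(page_content="\n".join(lines), metadata={"source": video_url})]

With a helper along these lines, `transcripts = get_transcript_yt_dlp(video_url)` could become `transcripts = get_transcript_documents(video_url)`, and the rest of the pipeline (`split_documents`, then `FAISS.from_documents`) would work unchanged.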