Spaces: Running

Yannael_LB committed
Commit · 12360eb
1 Parent(s): 8282d2e

Update

Browse files:
- app.py: +29 -138
- ErnWZxJovaM.json → examples/ErnWZxJovaM.json: +0 -0
- examples/ErnWZxJovaM_transcript.json: +0 -0
- utils.py: +17 -272
app.py (CHANGED)

Removed (hunks @@ -1,114 +1,31 @@ and @@ -123,58 +40,32 @@): the live-processing pipeline that fetched a transcript with youtube_transcript_api and ran it through Groq/OpenAI models.

import gradio as gr
import os

from youtube_transcript_api import YouTubeTranscriptApi

import utils


from groq import Groq

from dotenv import load_dotenv
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


#import importlib
#importlib.reload(utils)

def get_llm_client_and_model(llm_model):
    if llm_model == "llama3-8b":
        llm_client = Groq(api_key=GROQ_API_KEY)
        llm_model = 'llama3-8b-8192'

    elif llm_model == "gpt-4o-mini":
        llm_client = OpenAI(api_key=OPENAI_API_KEY)
        llm_model = 'gpt-4o-mini-2024-07-18'

    return llm_client, llm_model


def gradio_process_video(video_id,
                         model_format_transcript, model_toc,
                         chunk_size_format_transcript, chunk_size_toc,
                         progress=gr.Progress()):
    if video_id in ["ErnWZxJovaM"]:
        chapters = utils.load_json_chapters(video_id)

    else:

        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

        chunk_size_format_transcript = int(chunk_size_format_transcript)

        llm_client_format_transcript, llm_model_format_transcript = \
            get_llm_client_and_model(model_format_transcript)

        paragraphs, nb_input_tokens, nb_output_tokens, price = \
            utils.transcript_to_paragraphs(transcript, \
                                           llm_client_format_transcript, llm_model_format_transcript, \
                                           chunk_size=chunk_size_format_transcript, progress=progress)

        paragraphs = utils.add_timestamps_to_paragraphs(transcript, paragraphs, num_words=50)

        chunk_size_toc = int(chunk_size_toc)

        llm_client_get_toc, llm_model_get_toc = \
            get_llm_client_and_model(model_toc)

        json_toc, nb_input_tokens, nb_output_tokens, price = \
            utils.paragraphs_to_toc(paragraphs, \
                                    llm_client_get_toc, llm_model_get_toc, \
                                    chunk_size=chunk_size_toc)

        chapters = utils.get_chapters(paragraphs, json_toc)

    output_html = utils.get_result_as_html(chapters, video_id)

    return {output_processing: str(output_html),
            gv_output: output_html}


# (Old lines 73-113 held a second, duplicated definition of gradio_process_video,
#  removed as well; it is only partially visible in the rendered diff.)

# %%
# … (old lines 115-122: css definition, unchanged, not shown in the diff) …
}
"""

example_video_id = "ErnWZxJovaM"
example_chapters = utils.load_json_chapters(example_video_id)
example_output_html = utils.get_result_as_html(example_chapters, example_video_id)

with (gr.Blocks(css=css) as app):
    gr.HTML("<div align='center'><h1   # (title line truncated in the rendered diff)
    gr.HTML("<div align='center'><     # (subtitle line truncated in the rendered diff)
    gr.HTML("<hr>")
    gr.Markdown("""This demo relies on
    - Groq's Llama 3 8B for transcript preprocessing
    - OpenAI's GPT-4o-mini for chaptering. Note: Using GPT-4o-mini for transcript preprocessing will improve results, but takes longer (around 2/3 minutes for a one-hour video)
    # (remainder of the Markdown block truncated in the rendered diff)
    )

    video_id_input = gr.Textbox(label="Enter YouTube Video ID", value="EuC1GWhQdKE")

    with gr.Accordion("Set parameters", open=False):
        with gr.Row():
            with gr.Column(scale=1):
                model_format_transcript = gr.Dropdown(
                    [("LLama 3 8B (Groq)", "llama3-8b"), ("GPT-4o-mini (OpenAI)", "gpt-4o-mini")],
                    label="Transcript preprocessing", value="llama3-8b", interactive=True)
                chunk_size_format_transcript = gr.Textbox(label="Preprocessing chunk size", value=2000)
            with gr.Column(scale=1):
                model_toc = gr.Dropdown([("LLama 3 8B (Groq)", "llama3-8b"), ("GPT-4o-mini (OpenAI)", "gpt-4o-mini")],
                                        label="Chaptering", value="gpt-4o-mini", interactive=True)
                chunk_size_toc = gr.Textbox(label="Chaptering chunk size", value=30)
            with gr.Column(scale=1):
                api_key_openai = gr.Textbox(label="OpenAI API Key", value="xxx")
                api_key_groq = gr.Textbox(label="Groq API Key", value="xxx")

    processing_button = gr.Button("Process transcript")

    gv_output = gr.State()

    gr.HTML("<hr>")

    output_processing = gr.HTML(label="Output processing", value=example_output_html)

    # (the click() call head is truncated in the rendered diff; presumably processing_button.click(…)
    inputs=[video_id_input,
            # (truncated in the rendered diff)
            chunk_size_format_transcript, chunk_size_toc],
    outputs=[output_processing, gv_output])

# gr.HTML(result_as_html)

Two defects in this removed version are worth noting: OpenAI was used in get_llm_client_and_model without ever being imported, and gradio_process_video was defined twice, so the second definition silently shadowed the first. Both disappear with this commit.
Added (new lines 1-31 and 40-71): the Space now only loads precomputed examples from the examples/ directory; no API keys or live LLM calls remain.

import gradio as gr
import os

import utils

example_video_id = "ErnWZxJovaM"

example_output_transcript = utils.load_transcript(example_video_id)
example_chapters = utils.load_json_chapters(example_video_id)
example_output_html = utils.get_result_as_html(example_chapters, example_video_id)

example_video_id_dict = {"MIT Introduction to Deep Learning | 6.S191 - Alexander Amini": "ErnWZxJovaM",
                         "dog": "b8HO6hba9ZE",
                         "bird": "EuC1GWhQdKE"}

example_video_names = list(example_video_id_dict.keys())

def gradio_load_example(example_video):

    video_id = example_video_id_dict[example_video]

    transcript_as_text = utils.load_transcript(video_id)

    chapters = utils.load_json_chapters(video_id)
    output_html = utils.get_result_as_html(chapters, video_id)

    return {output_processing: output_html,
            output_transcript: transcript_as_text}


# %%
# … (new lines 32-39: css definition, unchanged, not shown in the diff) …
}
"""

with (gr.Blocks(css=css) as app):
    gr.HTML("<div align='center'><h1>Demo: Automatic video chaptering with LLMs and TF-IDF</h1></div>")
    gr.HTML("<div align='center'><h2>From raw transcript to structured document</h2></div>")
    gr.HTML("<div align='center'><h3>See the companion <a href=''>Medium article</a> and <a href=''>Github repository</a> for more details</h3>")
    gr.HTML("<hr>")

    #gv_transcript = gr.State()

    video_id_input = gr.Dropdown(choices=example_video_names,
                                 label="Choose a video to see the structured transcript",
                                 value=example_video_names[0])

    load_button = gr.Button("Load example")

    #gv_output = gr.State()

    gr.HTML("<hr>")

    with gr.Accordion("See raw transcript", open=False):
        output_transcript = gr.Textbox(value=example_output_transcript, max_lines=10, lines=10, label="Raw transcript")

    output_processing = gr.HTML(label="Output processing", value=example_output_html)

    load_button.click(gradio_load_example,
                      inputs=[video_id_input],
                      outputs=[output_processing, output_transcript])

# gr.HTML(result_as_html)
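A detail of the new wiring that is easy to miss: gradio_load_example returns a dict keyed by output components instead of a positional tuple. A minimal, self-contained sketch of this Gradio Blocks pattern (the component and function names here are illustrative, not taken from the Space):

import gradio as gr

with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    greeting = gr.Textbox(label="Greeting")
    length = gr.Number(label="Length")

    def greet(n):
        # With outputs declared as a list of components, the handler may
        # return a dict mapping each output component to its new value.
        text = f"Hello, {n}!"
        return {greeting: text, length: len(text)}

    gr.Button("Greet").click(greet, inputs=[name], outputs=[greeting, length])

if __name__ == "__main__":
    demo.launch()

This is why gradio_load_example can name its targets (output_processing, output_transcript) rather than relying on return order.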
ErnWZxJovaM.json → examples/ErnWZxJovaM.json (RENAMED)
File without changes.

examples/ErnWZxJovaM_transcript.json (ADDED)
The diff for this file is too large to render. See raw diff.
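The added transcript file is too large to render, but its shape can be inferred from how utils.load_transcript consumes it: a JSON list of segments from which only the 'start' and 'text' keys are read. A hypothetical two-segment excerpt, written as a Python literal for illustration only (the values are invented; the 'duration' key is an assumption based on the youtube_transcript_api output format):

transcript = [
    {"text": "good afternoon everyone", "start": 0.0, "duration": 4.2},   # invented values
    {"text": "and welcome to the course", "start": 4.2, "duration": 3.8},
]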
utils.py (CHANGED)

Removed (hunk @@ -1,282 +1,13 @@): the whole LLM pipeline, comprising the pricing table, the LLM call wrapper, transcript-to-paragraphs, TF-IDF timestamp inference, and table-of-contents generation.

import json
import re

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


###########################

price_token={'gpt-4o': {'input': 5/1000000, 'output': 15/1000000},
             'gpt-4o-2024-08-06': {'input': 2.5/1000000, 'output': 10/1000000},
             'gpt-4o-mini-2024-07-18': {'input': 0.15/1000000, 'output': 0.6/1000000},
             'llama3-8b-8192' : {'input': 0.05 / 1000000, 'output': 0.08 / 1000000},
             'llama3-70b-8192' : {'input': 0.59 / 1000000, 'output': 0.79 / 1000000},
             'claude-3-5-sonnet-20240620': {'input': 3/1000000, 'output': 15/1000000},
             'claude-3-haiku-20240307': {'input': 0.25/1000000, 'output': 1.25/1000000},
             }

def call_llm(client, model, system_prompt, prompt,
             temperature=0, seed=42, response_format=None, max_tokens=5000):

    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        model=model,
        temperature=temperature,
        seed=seed,
        response_format=response_format,
        max_tokens=max_tokens
    )

    nb_input_tokens = response.usage.prompt_tokens
    nb_output_tokens = response.usage.completion_tokens
    price = nb_input_tokens * price_token[model]['input'] + nb_output_tokens * price_token[model]['output']

    print(f"input tokens: {nb_input_tokens}; output tokens: {nb_output_tokens}, price: {price}")

    response_content=response.choices[0].message.content

    return response_content, nb_input_tokens, nb_output_tokens, price

########################### Step 2: Transcript to paragraph ###########################

system_prompt_transcript_to_paragraphs = f"""

You are a helpful assistant.

Your task is to improve the user input's readability: add punctuation if needed, remove verbal tics, correct grammatical errors, and add appropriate line breaks with '\n\n'.

Put your answer within <answer></answer> tags.

"""


def transcript_to_paragraphs(transcript, llm_client, llm_model, chunk_size=5000, progress=None):

    transcript_as_text = ' '.join([s['text'] for s in transcript])

    paragraphs = []
    last_paragraph = ""

    total_nb_input_tokens, total_nb_output_tokens, total_price = 0, 0, 0

    nb_chunks = int(len(transcript_as_text) / chunk_size) + 1
    progress_i = 0
    print(f"Number of chunks: {nb_chunks}")

    # for i in range(0, 10000, chunk_size):
    for i in range(0, len(transcript_as_text), chunk_size):

        print("i is: " + str(i))

        chunk = last_paragraph + " " + transcript_as_text[i:i + chunk_size]

        if progress is not None:
            progress_i += 1
            progress(progress_i / nb_chunks, desc="Processing")

        found_edited_transcript = False

        while not found_edited_transcript:

            response_content, nb_input_tokens, nb_output_tokens, price = \
                call_llm(llm_client, llm_model,
                         system_prompt=system_prompt_transcript_to_paragraphs, prompt=chunk,
                         temperature=0.2, seed=42, response_format=None)

            if not "</answer>" in response_content:
                response_content += "</answer>"

            # Extract content from <answer> tags
            pattern = re.compile(r'<answer>(.*?)</answer>', re.DOTALL)
            response_content_edited = pattern.findall(response_content)

            if len(response_content_edited) > 0:
                found_edited_transcript = True
                response_content_edited = response_content_edited[0]

            else:
                print("No edited transcript found. Trying again.")
                print(response_content[0:100])
                print(response_content[-100:])

            total_nb_input_tokens += nb_input_tokens
            total_nb_output_tokens += nb_output_tokens
            total_price += price

        paragraphs_chunk = response_content_edited.strip().split('\n\n')

        print('Found paragraphs:', len(paragraphs_chunk))
        last_paragraph = paragraphs_chunk[-1]

        paragraphs += paragraphs_chunk[:-1]

    paragraphs += [last_paragraph]

    paragraphs_dict = [{'paragraph_number': i, 'paragraph_text': paragraph} for i, paragraph in enumerate(paragraphs)]

    return paragraphs_dict, total_nb_input_tokens, total_nb_output_tokens, total_price

########################### Step 3: Infer timestamps ###########################

def transform_text_segments(text_segments, num_words=50):
    # Initialize variables
    transformed_segments = []
    current_index = 0
    num_segments = len(text_segments)

    for i in range(num_segments):

        current_index = i

        # Get the current segment's starting timestamp and text
        current_segment = text_segments[current_index]
        current_text = current_segment['text']

        # Initialize a list to hold the combined text
        combined_text = " ".join(current_text.split()[:num_words])
        number_words_collected = len(current_text.split())

        # Collect words from subsequent segments
        while number_words_collected < num_words and (current_index + 1) < num_segments:
            current_index += 1
            next_segment = text_segments[current_index]
            next_text = next_segment['text']
            next_words = next_text.split()

            # Append words from the next segment
            if number_words_collected + len(next_words) <= num_words:
                combined_text += ' ' + next_text
                number_words_collected += len(next_words)
            else:
                # Only append enough words to reach the num_words limit
                words_needed = num_words - number_words_collected
                combined_text += ' ' + ' '.join(next_words[:words_needed])
                number_words_collected = num_words

        # Append the combined segment to the result
        transformed_segments.append(combined_text)

    return transformed_segments


def add_timestamps_to_paragraphs(transcript, paragraphs, num_words=50):
    list_indices = []

    transcript_num_words = transform_text_segments(transcript, num_words=num_words)

    paragraphs_start_text = [{"start": p['paragraph_number'], "text": p['paragraph_text']} for p in paragraphs]
    paragraphs_num_words = transform_text_segments(paragraphs_start_text, num_words=num_words)

    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer().fit_transform(transcript_num_words + paragraphs_num_words)
    # Get the TF-IDF vectors for the transcript and the excerpt
    vectors = vectorizer.toarray()

    for i in range(len(paragraphs_num_words)):

        # Extract the TF-IDF vector for the paragraph
        paragraph_vector = vectors[len(transcript_num_words) + i]

        # Calculate the cosine similarity between the paragraph vector and each transcript chunk
        similarities = cosine_similarity(vectors[:len(transcript_num_words)], paragraph_vector.reshape(1, -1))
        # Find the index of the most similar chunk
        best_match_index = int(np.argmax(similarities))

        list_indices.append(best_match_index)

        paragraphs[i]['matched_index'] = best_match_index
        paragraphs[i]['matched_text'] = transcript[best_match_index]['text']
        paragraphs[i]['start_time'] = int(transcript[best_match_index]['start']) - 2
        if paragraphs[i]['start_time'] < 0:
            paragraphs[i]['start_time'] = 0

    return paragraphs

########################### Step 4: Generate table of content ###########################


system_prompt_paragraphs_to_toc = """

You are a helpful assistant.

You are given a transcript of a course in JSON format as a list of paragraphs, each containing 'paragraph_number' and 'paragraph_text' keys.

Your task is to group consecutive paragraphs in chapters for the course and identify meaningful chapter titles.

Here are the steps to follow:

1. Read the transcript carefully to understand its general structure and the main topics covered.
2. Look for clues that a new chapter is about to start. This could be a change of topic, a change of time or setting, the introduction of new themes or topics, or the speaker's explicit mention of a new part.
3. For each chapter, keep track of the paragraph number that starts the chapter and identify a meaningful chapter title.
4. Chapters should ideally be equally spaced throughout the transcript, and discuss a specific topic.
5. A chapter MUST have more than 4 paragraphs.

Format your result in JSON, with a list dictionaries for chapters, with 'start_paragraph_number':integer and 'title':string as key:value.

Example:
{"chapters":
    [{"start_paragraph_number": 0, "title": "Introduction"},
     {"start_paragraph_number": 10, "title": "Chapter 1"}
    ]
}

"""


def paragraphs_to_toc(paragraphs, llm_client, llm_model, chunk_size=100):
    chapters = []
    number_last_chapter = 0

    total_nb_input_tokens, total_nb_output_tokens, total_price = 0, 0, 0

    while number_last_chapter < len(paragraphs):

        print(number_last_chapter)

        chunk = paragraphs[number_last_chapter:(number_last_chapter + chunk_size)]
        chunk = [{'paragraph_number': p['paragraph_number'], 'paragraph_text': p['paragraph_text']} for p in chunk]

        chunk_json_dump = json.dumps(chunk)

        content, nb_input_tokens, nb_output_tokens, price = call_llm( \
            llm_client, llm_model, \
            system_prompt_paragraphs_to_toc, chunk_json_dump, \
            temperature=0, seed=42, response_format={"type": "json_object"})

        total_nb_input_tokens += nb_input_tokens
        total_nb_output_tokens += nb_output_tokens

        chapters_chunk = json.loads(content)['chapters']

        if number_last_chapter == chapters_chunk[-1]['start_paragraph_number']:
            break

        chapters += chapters_chunk[:-1]

        number_last_chapter = chapters_chunk[-1]['start_paragraph_number']
        if number_last_chapter >= len(paragraphs) - 5:
            break

    total_price = (total_nb_input_tokens * price_token[llm_model]['input'] +
                   total_nb_output_tokens * price_token[llm_model]['output'])

    chapters += [chapters_chunk[-1]]

    return chapters, total_nb_input_tokens, total_nb_output_tokens, total_price


########################### Step 5: Chapter rendering functions ###########################

def get_chapters(paragraphs, table_of_content):
    # … (kept as unchanged context) …

And further down, in hunk @@ -417,8 +148,22 @@ (after get_result_as_html), the chapters file used to be loaded from the repository root:

        return result_as_html

def load_json_chapters(video_id):
    file_name = f"{video_id}.json"
    with open(file_name, 'r') as file:
        chapters = json.load(file)
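For context on the removed pricing logic: price_token stores per-token dollar rates, so call_llm's cost line is just token counts times rates. A quick worked example using the gpt-4o-mini rates from the table above (the token counts are invented for illustration):

# Worked example of the removed pricing arithmetic (illustrative token counts).
price_token = {'gpt-4o-mini-2024-07-18': {'input': 0.15 / 1000000, 'output': 0.6 / 1000000}}

nb_input_tokens, nb_output_tokens = 10000, 2000
rates = price_token['gpt-4o-mini-2024-07-18']
price = nb_input_tokens * rates['input'] + nb_output_tokens * rates['output']
print(f"price: ${price:.4f}")  # 0.0015 + 0.0012 = $0.0027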
Kept and added (new version, 13 lines at the top plus hunk @@ -417,8 +148,22 @@): only the chapter rendering helpers survive, along with two new loaders for the precomputed examples.

import json
import re


import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


########################### Chapter rendering functions ###########################

def get_chapters(paragraphs, table_of_content):
    # … (new lines 13-147: get_chapters and get_result_as_html bodies, unchanged, not shown in the diff) …

        return result_as_html

def get_transcript_as_text(transcript):
    temp_list = [convert_seconds_to_hms(int(s['start']))+' '+s['text'] for s in transcript]
    transcript_as_text = '\n'.join(temp_list)

    return transcript_as_text

def load_transcript(video_id):
    file_name = f"examples/{video_id}_transcript.json"
    with open(file_name, 'r') as file:
        transcript = json.load(file)

    transcript_as_text = get_transcript_as_text(transcript)
    return transcript_as_text

def load_json_chapters(video_id):
    file_name = f"examples/{video_id}.json"    # was f"{video_id}.json" before the rename
    with open(file_name, 'r') as file:
        chapters = json.load(file)
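Note that get_transcript_as_text calls convert_seconds_to_hms, which does not appear anywhere in this diff and presumably lives in the unchanged part of utils.py. A minimal stand-in, assuming the helper formats a second count as H:MM:SS (a sketch, not the Space's actual implementation):

# Hypothetical stand-in for the convert_seconds_to_hms helper not shown in the diff.
def convert_seconds_to_hms(seconds):
    # Split a second count into hours, minutes, and remaining seconds.
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}"

print(convert_seconds_to_hms(3725))  # -> 1:02:05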