Yannael_LB committed on
Commit 12360eb · 1 Parent(s): 8282d2e
app.py CHANGED
@@ -1,114 +1,31 @@
  import gradio as gr
  import os

- from youtube_transcript_api import YouTubeTranscriptApi
-
  import utils

- from openai import OpenAI
- from groq import Groq
-
- from dotenv import load_dotenv
- load_dotenv()
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-
-
- #import importlib
- #importlib.reload(utils)
-
- def get_llm_client_and_model(llm_model):
-     if llm_model == "llama3-8b":
-         llm_client = Groq(api_key=GROQ_API_KEY)
-         llm_model = 'llama3-8b-8192'
-
-     elif llm_model == "gpt-4o-mini":
-         llm_client = OpenAI(api_key=OPENAI_API_KEY)
-         llm_model = 'gpt-4o-mini-2024-07-18'
-
-     return llm_client, llm_model
-
-
- def gradio_process_video(video_id,
-                          model_format_transcript, model_toc,
-                          chunk_size_format_transcript, chunk_size_toc,
-                          progress=gr.Progress()):
-     if video_id in ["ErnWZxJovaM"]:
-         chapters = utils.load_json_chapters(video_id)
-
-     else:
-
-         transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
-
-         chunk_size_format_transcript = int(chunk_size_format_transcript)
-
-         llm_client_format_transcript, llm_model_format_transcript = \
-             get_llm_client_and_model(model_format_transcript)
-
-         paragraphs, nb_input_tokens, nb_output_tokens, price = \
-             utils.transcript_to_paragraphs(transcript, \
-                                            llm_client_format_transcript, llm_model_format_transcript, \
-                                            chunk_size=chunk_size_format_transcript, progress=progress)
-
-         paragraphs = utils.add_timestamps_to_paragraphs(transcript, paragraphs, num_words=50)
-
-         chunk_size_toc = int(chunk_size_toc)
-
-         llm_client_get_toc, llm_model_get_toc = \
-             get_llm_client_and_model(model_toc)
-
-         json_toc, nb_input_tokens, nb_output_tokens, price = \
-             utils.paragraphs_to_toc(paragraphs, \
-                                     llm_client_get_toc, llm_model_get_toc, \
-                                     chunk_size=chunk_size_toc)
-
-         chapters = utils.get_chapters(paragraphs, json_toc)
-
-     output_html = utils.get_result_as_html(chapters, video_id)
-
-     return {output_processing: str(output_html),
-             gv_output: output_html}
-
-
- def gradio_process_video(video_id,
-                          model_format_transcript, model_toc,
-                          chunk_size_format_transcript, chunk_size_toc,
-                          progress=gr.Progress()):
-     if video_id in ["ErnWZxJovaM"]:
-         chapters = utils.load_json_chapters(video_id)
-
-     else:
-
-         transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
-
-         chunk_size_format_transcript = int(chunk_size_format_transcript)
-
-         llm_client_format_transcript, llm_model_format_transcript = \
-             get_llm_client_and_model(model_format_transcript)

-         paragraphs, nb_input_tokens, nb_output_tokens, price = \
-             utils.transcript_to_paragraphs(transcript, \
-                                            llm_client_format_transcript, llm_model_format_transcript, \
-                                            chunk_size=chunk_size_format_transcript, progress=progress)

-         paragraphs = utils.add_timestamps_to_paragraphs(transcript, paragraphs, num_words=50)

-         chunk_size_toc = int(chunk_size_toc)

-         llm_client_get_toc, llm_model_get_toc = \
-             get_llm_client_and_model(model_toc)

-         json_toc, nb_input_tokens, nb_output_tokens, price = \
-             utils.paragraphs_to_toc(paragraphs, \
-                                     llm_client_get_toc, llm_model_get_toc, \
-                                     chunk_size=chunk_size_toc)

-         chapters = utils.get_chapters(paragraphs, json_toc)

      output_html = utils.get_result_as_html(chapters, video_id)

-     return {output_processing: str(output_html),
-             gv_output: output_html}


  # %%
@@ -123,58 +40,32 @@ css = """
  }
  """

- example_video_id = "ErnWZxJovaM"
- example_chapters = utils.load_json_chapters(example_video_id)
- example_output_html = utils.get_result_as_html(example_chapters, example_video_id)
-
  with (gr.Blocks(css=css) as app):
-     gr.HTML("<div align='center'><h1 class='header'>Demo: Automatic video chaptering with LLMs and TF-IDF</h1></div>")
-     gr.HTML("<div align='center'><h3 class='header'>From raw transcript to structured document</h3></div>")

      gr.HTML("<hr>")
-     gr.Markdown("""This demo relies on
-     - Groq's Llama 3 8B for transcript preprocessing
-     - OpenAI's GPT-4o-mini for chaptering. Note: Using GPT-4o-mini for transcript preprocessing will improve results, but takes longer (around 2/3 minutes for a one-hour video)

-     The following YouTube video ID are already preprocessed (copy and paste ID in box below):

-     - `ErnWZxJovaM`: [MIT course](https://www.youtube.com/watch?v=ErnWZxJovaM)
-     - `EuC1GWhQdKE`: [Anthropic](https://www.youtube.com/watch?v=EuC1GWhQdKE)

-     Check the [Medium article]() for more details"""
-     )

-     gv_transcript = gr.State()
-
-     video_id_input = gr.Textbox(label="Enter YouTube Video ID", value="EuC1GWhQdKE")
-
-     with gr.Accordion("Set parameters", open=False):
-         with gr.Row():
-             with gr.Column(scale=1):
-                 model_format_transcript = gr.Dropdown(
-                     [("LLama 3 8B (Groq)", "llama3-8b"), ("GPT-4o-mini (OpenAI)", "gpt-4o-mini")],
-                     label="Transcript preprocessing", value="llama3-8b", interactive=True)
-                 chunk_size_format_transcript = gr.Textbox(label="Preprocessing chunk size", value=2000)
-             with gr.Column(scale=1):
-                 model_toc = gr.Dropdown([("LLama 3 8B (Groq)", "llama3-8b"), ("GPT-4o-mini (OpenAI)", "gpt-4o-mini")],
-                                         label="Chaptering", value="gpt-4o-mini", interactive=True)
-                 chunk_size_toc = gr.Textbox(label="Chaptering chunk size", value=30)
-             with gr.Column(scale=1):
-                 api_key_openai = gr.Textbox(label="OpenAI API Key", value="xxx")
-                 api_key_groq = gr.Textbox(label="Groq API Key", value="xxx")
-
-     processing_button = gr.Button("Process transcript")
-
-     gv_output = gr.State()

      gr.HTML("<hr>")

      output_processing = gr.HTML(label="Output processing", value=example_output_html)

-     processing_button.click(gradio_process_video,
-                             inputs=[video_id_input,
-                                     model_format_transcript, model_toc,
-                                     chunk_size_format_transcript, chunk_size_toc],
-                             outputs=[output_processing, gv_output])

      # gr.HTML(result_as_html)
@@ -1,114 +1,31 @@
  import gradio as gr
  import os

  import utils

+ example_video_id = "ErnWZxJovaM"

+ example_output_transcript = utils.load_transcript(example_video_id)
+ example_chapters = utils.load_json_chapters(example_video_id)
+ example_output_html = utils.get_result_as_html(example_chapters, example_video_id)

+ example_video_id_dict = {"MIT Introduction to Deep Learning | 6.S191 - Alexander Amini": "ErnWZxJovaM",
+                          "dog": "b8HO6hba9ZE",
+                          "bird": "EuC1GWhQdKE"}

+ example_video_names = list(example_video_id_dict.keys())

+ def gradio_load_example(example_video):

+     video_id = example_video_id_dict[example_video]

+     transcript_as_text = utils.load_transcript(video_id)

+     chapters = utils.load_json_chapters(video_id)
      output_html = utils.get_result_as_html(chapters, video_id)

+     return {output_processing: output_html,
+             output_transcript: transcript_as_text}


  # %%
 
@@ -123,58 +40,32 @@ css = """
  }
  """

  with (gr.Blocks(css=css) as app):
+     gr.HTML("<div align='center'><h1>Demo: Automatic video chaptering with LLMs and TF-IDF</h1></div>")
+     gr.HTML("<div align='center'><h2>From raw transcript to structured document</h2></div>")
+     gr.HTML("<div align='center'><h3>See the companion <a href=''>Medium article</a> and <a href=''>Github repository</a> for more details</h3>")
      gr.HTML("<hr>")

+     #gv_transcript = gr.State()

+     video_id_input = gr.Dropdown(choices=example_video_names,
+                                  label="Choose a video to see the structured transcript",
+                                  value=example_video_names[0])

+     load_button = gr.Button("Load example")

+     #gv_output = gr.State()

      gr.HTML("<hr>")

+     with gr.Accordion("See raw transcript", open=False):
+         output_transcript = gr.Textbox(value=example_output_transcript, max_lines=10, lines=10, label="Raw transcript")
+
      output_processing = gr.HTML(label="Output processing", value=example_output_html)

+     load_button.click(gradio_load_example,
+                       inputs=[video_id_input],
+                       outputs=[output_processing, output_transcript])

      # gr.HTML(result_as_html)
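A note on the callback wiring above: gradio_load_example returns a dict keyed by output components, a Gradio pattern that lets a single handler update several components at once, provided those components are listed in outputs=. A minimal self-contained sketch of the same wiring (component names and example data are illustrative, not taken from this repository):

import gradio as gr

# Hypothetical stand-in for utils.load_transcript / utils.load_json_chapters
EXAMPLES = {"demo video": ("0:00:00 raw transcript text...", "<h2>Chapter 1</h2>")}

def load_example(name):
    transcript, html = EXAMPLES[name]
    # A dict keyed by component objects updates each component in one callback
    return {output_transcript: transcript, output_processing: html}

with gr.Blocks() as demo:
    choice = gr.Dropdown(choices=list(EXAMPLES.keys()), value="demo video", label="Example")
    load_button = gr.Button("Load example")
    output_transcript = gr.Textbox(label="Raw transcript", lines=10)
    output_processing = gr.HTML()
    load_button.click(load_example, inputs=[choice], outputs=[output_processing, output_transcript])

demo.launch()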
ErnWZxJovaM.json → examples/ErnWZxJovaM.json RENAMED
File without changes
examples/ErnWZxJovaM_transcript.json ADDED
The diff for this file is too large to render. See raw diff
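The added transcript file is not rendered here, but the utils.py changes below read it as a list of segments, each carrying 'text' and 'start' keys (the shape returned by youtube_transcript_api, which also includes 'duration'). A plausible excerpt, with invented values:

# Assumed shape of examples/<video_id>_transcript.json (illustrative values):
transcript = [
    {"text": "Good afternoon, everyone.", "start": 0.0, "duration": 2.1},
    {"text": "Welcome to MIT 6.S191.", "start": 2.1, "duration": 1.9},
]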
 
utils.py CHANGED
@@ -1,282 +1,13 @@
  import json
  import re

  import numpy as np
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity


- ########################### LLM call ###########################
-
- price_token={'gpt-4o': {'input': 5/1000000, 'output': 15/1000000},
-              'gpt-4o-2024-08-06': {'input': 2.5/1000000, 'output': 10/1000000},
-              'gpt-4o-mini-2024-07-18': {'input': 0.15/1000000, 'output': 0.6/1000000},
-              'llama3-8b-8192' : {'input': 0.05 / 1000000, 'output': 0.08 / 1000000},
-              'llama3-70b-8192' : {'input': 0.59 / 1000000, 'output': 0.79 / 1000000},
-              'claude-3-5-sonnet-20240620': {'input': 3/1000000, 'output': 15/1000000},
-              'claude-3-haiku-20240307': {'input': 0.25/1000000, 'output': 1.25/1000000},
-              }
- def call_llm(client, model, system_prompt, prompt,
-              temperature=0, seed=42, response_format=None, max_tokens=5000):
-
-     response = client.chat.completions.create(
-         messages=[
-             {
-                 "role": "system",
-                 "content": system_prompt
-             },
-             {
-                 "role": "user",
-                 "content": prompt
-             }
-         ],
-         model=model,
-         temperature=temperature,
-         seed=seed,
-         response_format=response_format,
-         max_tokens=max_tokens
-     )
-
-     nb_input_tokens = response.usage.prompt_tokens
-     nb_output_tokens = response.usage.completion_tokens
-     price = nb_input_tokens * price_token[model]['input'] + nb_output_tokens * price_token[model]['output']
-
-     print(f"input tokens: {nb_input_tokens}; output tokens: {nb_output_tokens}, price: {price}")
-
-     response_content=response.choices[0].message.content
-
-     return response_content, nb_input_tokens, nb_output_tokens, price
-
- ########################### Step 2: Transcript to paragraph ###########################
-
- system_prompt_transcript_to_paragraphs = f"""
-
- You are a helpful assistant.
-
- Your task is to improve the user input's readability: add punctuation if needed, remove verbal tics, correct grammatical errors, and add appropriate line breaks with '\n\n'.
-
- Put your answer within <answer></answer> tags.
-
- """
-
-
- def transcript_to_paragraphs(transcript, llm_client, llm_model, chunk_size=5000, progress=None):
-
-     transcript_as_text = ' '.join([s['text'] for s in transcript])
-
-     paragraphs = []
-     last_paragraph = ""
-
-     total_nb_input_tokens, total_nb_output_tokens, total_price = 0, 0, 0
-
-     nb_chunks = int(len(transcript_as_text) / chunk_size) + 1
-     progress_i = 0
-     print(f"Number of chunks: {nb_chunks}")
-
-     # for i in range(0, 10000, chunk_size):
-     for i in range(0, len(transcript_as_text), chunk_size):
-
-         print("i is: " + str(i))
-
-         chunk = last_paragraph + " " + transcript_as_text[i:i + chunk_size]
-
-         if progress is not None:
-             progress_i += 1
-             progress(progress_i / nb_chunks, desc="Processing")
-
-         found_edited_transcript = False
-
-         while not found_edited_transcript:
-
-             response_content, nb_input_tokens, nb_output_tokens, price = \
-                 call_llm(llm_client, llm_model,
-                          system_prompt=system_prompt_transcript_to_paragraphs, prompt=chunk,
-                          temperature=0.2, seed=42, response_format=None)
-
-             if not "</answer>" in response_content:
-                 response_content += "</answer>"
-
-             # Extract content from <edited_transcript> tags
-             pattern = re.compile(r'<answer>(.*?)</answer>', re.DOTALL)
-             response_content_edited = pattern.findall(response_content)
-
-             if len(response_content_edited) > 0:
-                 found_edited_transcript = True
-                 response_content_edited = response_content_edited[0]
-
-             else:
-                 print("No edited transcript found. Trying again.")
-                 print(response_content[0:100])
-                 print(response_content[-100:])
-
-         total_nb_input_tokens += nb_input_tokens
-         total_nb_output_tokens += nb_output_tokens
-         total_price += price
-
-         paragraphs_chunk = response_content_edited.strip().split('\n\n')
-
-         print('Found paragraphs:', len(paragraphs_chunk))
-         last_paragraph = paragraphs_chunk[-1]
-
-         paragraphs += paragraphs_chunk[:-1]
-
-     paragraphs += [last_paragraph]
-
-     paragraphs_dict = [{'paragraph_number': i, 'paragraph_text': paragraph} for i, paragraph in enumerate(paragraphs)]
-
-     return paragraphs_dict, total_nb_input_tokens, total_nb_output_tokens, total_price
-
- ########################### Step 3: Infer timestamps ###########################
-
- def transform_text_segments(text_segments, num_words=50):
-     # Initialize variables
-     transformed_segments = []
-     current_index = 0
-     num_segments = len(text_segments)
-
-     for i in range(num_segments):
-
-         current_index = i
-
-         # Get the current segment's starting timestamp and text
-         current_segment = text_segments[current_index]
-         current_text = current_segment['text']
-
-         # Initialize a list to hold the combined text
-         combined_text = " ".join(current_text.split()[:num_words])
-         number_words_collected = len(current_text.split())
-
-         # Collect words from subsequent segments
-         while number_words_collected < num_words and (current_index + 1) < num_segments:
-             current_index += 1
-             next_segment = text_segments[current_index]
-             next_text = next_segment['text']
-             next_words = next_text.split()
-
-             # Append words from the next segment
-             if number_words_collected + len(next_words) <= num_words:
-                 combined_text += ' ' + next_text
-                 number_words_collected += len(next_words)
-             else:
-                 # Only append enough words to reach the num_words limit
-                 words_needed = num_words - number_words_collected
-                 combined_text += ' ' + ' '.join(next_words[:words_needed])
-                 number_words_collected = num_words
-
-         # Append the combined segment to the result
-         transformed_segments.append(combined_text)
-
-     return transformed_segments
-
-
- def add_timestamps_to_paragraphs(transcript, paragraphs, num_words=50):
-     list_indices = []
-
-     transcript_num_words = transform_text_segments(transcript, num_words=num_words)
-
-     paragraphs_start_text = [{"start": p['paragraph_number'], "text": p['paragraph_text']} for p in paragraphs]
-     paragraphs_num_words = transform_text_segments(paragraphs_start_text, num_words=num_words)
-
-     # Create a TF-IDF vectorizer
-     vectorizer = TfidfVectorizer().fit_transform(transcript_num_words + paragraphs_num_words)
-     # Get the TF-IDF vectors for the transcript and the excerpt
-     vectors = vectorizer.toarray()
-
-     for i in range(len(paragraphs_num_words)):
-
-         # Extract the TF-IDF vector for the paragraph
-         paragraph_vector = vectors[len(transcript_num_words) + i]
-
-         # Calculate the cosine similarity between the paragraph vector and each transcript chunk
-         similarities = cosine_similarity(vectors[:len(transcript_num_words)], paragraph_vector.reshape(1, -1))
-         # Find the index of the most similar chunk
-         best_match_index = int(np.argmax(similarities))
-
-         list_indices.append(best_match_index)
-
-         paragraphs[i]['matched_index'] = best_match_index
-         paragraphs[i]['matched_text'] = transcript[best_match_index]['text']
-         paragraphs[i]['start_time'] = int(transcript[best_match_index]['start']) - 2
-         if paragraphs[i]['start_time'] < 0:
-             paragraphs[i]['start_time'] = 0
-
-     return paragraphs
-
- ########################### Step 4: Generate table of content ###########################
-
-
- system_prompt_paragraphs_to_toc = """
-
- You are a helpful assistant.
-
- You are given a transcript of a course in JSON format as a list of paragraphs, each containing 'paragraph_number' and 'paragraph_text' keys.
-
- Your task is to group consecutive paragraphs in chapters for the course and identify meaningful chapter titles.
-
- Here are the steps to follow:
-
- 1. Read the transcript carefully to understand its general structure and the main topics covered.
- 2. Look for clues that a new chapter is about to start. This could be a change of topic, a change of time or setting, the introduction of new themes or topics, or the speaker's explicit mention of a new part.
- 3. For each chapter, keep track of the paragraph number that starts the chapter and identify a meaningful chapter title.
- 4. Chapters should ideally be equally spaced throughout the transcript, and discuss a specific topic.
- 5. A chapter MUST have more than 4 paragraphs.
-
- Format your result in JSON, with a list dictionaries for chapters, with 'start_paragraph_number':integer and 'title':string as key:value.
-
- Example:
- {"chapters":
-     [{"start_paragraph_number": 0, "title": "Introduction"},
-      {"start_paragraph_number": 10, "title": "Chapter 1"}
-     ]
- }
-
- """
-
-
- def paragraphs_to_toc(paragraphs, llm_client, llm_model, chunk_size=100):
-     chapters = []
-     number_last_chapter = 0
-
-     total_nb_input_tokens, total_nb_output_tokens, total_price = 0, 0, 0
-
-     while number_last_chapter < len(paragraphs):
-
-         print(number_last_chapter)
-
-         chunk = paragraphs[number_last_chapter:(number_last_chapter + chunk_size)]
-         chunk = [{'paragraph_number': p['paragraph_number'], 'paragraph_text': p['paragraph_text']} for p in chunk]
-
-         chunk_json_dump = json.dumps(chunk)
-
-         content, nb_input_tokens, nb_output_tokens, price = call_llm( \
-             llm_client, llm_model, \
-             system_prompt_paragraphs_to_toc, chunk_json_dump, \
-             temperature=0, seed=42, response_format={"type": "json_object"})
-
-         total_nb_input_tokens += nb_input_tokens
-         total_nb_output_tokens += nb_output_tokens
-
-         chapters_chunk = json.loads(content)['chapters']
-
-         if number_last_chapter == chapters_chunk[-1]['start_paragraph_number']:
-             break
-
-         chapters += chapters_chunk[:-1]
-
-         number_last_chapter = chapters_chunk[-1]['start_paragraph_number']
-         if number_last_chapter >= len(paragraphs) - 5:
-             break
-
-     total_price = (total_nb_input_tokens * price_token[llm_model]['input'] +
-                    total_nb_output_tokens * price_token[llm_model]['output'])
-
-     chapters += [chapters_chunk[-1]]
-
-     return chapters, total_nb_input_tokens, total_nb_output_tokens, total_price
-
-
- ########################### Step 5: Chapter rendering functions ###########################

  def get_chapters(paragraphs, table_of_content):

@@ -417,8 +148,22 @@ def get_result_as_html(chapters, video_id):

      return result_as_html

  def load_json_chapters(video_id):
-     file_name = f"{video_id}.json"
      with open(file_name, 'r') as file:
          chapters = json.load(file)

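The removed add_timestamps_to_paragraphs above implements the TF-IDF matching referenced in the demo title: each LLM-rewritten paragraph is mapped back to the transcript segment whose opening words are most similar, which recovers a start timestamp for the paragraph. A condensed, self-contained sketch of that technique on toy data (not the repository's API):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

segments = ["good afternoon everyone welcome to",
            "deep learning is a subfield of machine learning",
            "let us start with a simple example"]
paragraph = ["Let us start with a simple example."]

# Fit one shared TF-IDF space over transcript segments and the paragraph
vectors = TfidfVectorizer().fit_transform(segments + paragraph).toarray()

# Cosine similarity of the paragraph against every segment; argmax is the match
similarities = cosine_similarity(vectors[:len(segments)], vectors[len(segments):])
print(int(np.argmax(similarities)))  # -> 2, i.e. the third segment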
@@ -1,282 +1,13 @@
  import json
  import re

+
  import numpy as np
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity


+ ########################### Chapter rendering functions ###########################

  def get_chapters(paragraphs, table_of_content):

@@ -417,8 +148,22 @@ def get_result_as_html(chapters, video_id):

      return result_as_html

+ def get_transcript_as_text(transcript):
+     temp_list = [convert_seconds_to_hms(int(s['start']))+' '+s['text'] for s in transcript]
+     transcript_as_text = '\n'.join(temp_list)
+
+     return transcript_as_text
+
+ def load_transcript(video_id):
+     file_name = f"examples/{video_id}_transcript.json"
+     with open(file_name, 'r') as file:
+         transcript = json.load(file)
+
+     transcript_as_text = get_transcript_as_text(transcript)
+     return transcript_as_text
+
  def load_json_chapters(video_id):
+     file_name = f"examples/{video_id}.json"
      with open(file_name, 'r') as file:
          chapters = json.load(file)
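The added get_transcript_as_text depends on convert_seconds_to_hms, which lives in the unchanged portion of utils.py and is therefore not shown in this diff. A minimal sketch of what such a helper presumably does, assuming an H:MM:SS rendering:

def convert_seconds_to_hms(seconds):
    # Assumed behavior: format an integer number of seconds as H:MM:SS;
    # the actual helper is defined in the unchanged part of utils.py
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}"

print(convert_seconds_to_hms(3725))  # -> 1:02:05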