Spaces: Running

Yannael_LB committed
Commit · 12360eb
1 Parent(s): 8282d2e

Update

Browse files:
- app.py: +29 -138
- ErnWZxJovaM.json → examples/ErnWZxJovaM.json: +0 -0
- examples/ErnWZxJovaM_transcript.json: +0 -0
- utils.py: +17 -272
app.py (CHANGED)

Removed (hunks @@ -1,114 +1,31 @@ and @@ -123,58 +40,32 @@): the live-processing pipeline that fetched a transcript with youtube_transcript_api and ran it through Groq/OpenAI models.

import gradio as gr
import os

from youtube_transcript_api import YouTubeTranscriptApi

import utils


from groq import Groq

from dotenv import load_dotenv
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


#import importlib
#importlib.reload(utils)

def get_llm_client_and_model(llm_model):
    if llm_model == "llama3-8b":
        llm_client = Groq(api_key=GROQ_API_KEY)
        llm_model = 'llama3-8b-8192'

    elif llm_model == "gpt-4o-mini":
        llm_client = OpenAI(api_key=OPENAI_API_KEY)
        llm_model = 'gpt-4o-mini-2024-07-18'

    return llm_client, llm_model


def gradio_process_video(video_id,
                         model_format_transcript, model_toc,
                         chunk_size_format_transcript, chunk_size_toc,
                         progress=gr.Progress()):
    if video_id in ["ErnWZxJovaM"]:
        chapters = utils.load_json_chapters(video_id)

    else:

        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

        chunk_size_format_transcript = int(chunk_size_format_transcript)

        llm_client_format_transcript, llm_model_format_transcript = \
            get_llm_client_and_model(model_format_transcript)

        paragraphs, nb_input_tokens, nb_output_tokens, price = \
            utils.transcript_to_paragraphs(transcript, \
                                           llm_client_format_transcript, llm_model_format_transcript, \
                                           chunk_size=chunk_size_format_transcript, progress=progress)

        paragraphs = utils.add_timestamps_to_paragraphs(transcript, paragraphs, num_words=50)

        chunk_size_toc = int(chunk_size_toc)

        llm_client_get_toc, llm_model_get_toc = \
            get_llm_client_and_model(model_toc)

        json_toc, nb_input_tokens, nb_output_tokens, price = \
            utils.paragraphs_to_toc(paragraphs, \
                                    llm_client_get_toc, llm_model_get_toc, \
                                    chunk_size=chunk_size_toc)

        chapters = utils.get_chapters(paragraphs, json_toc)

    output_html = utils.get_result_as_html(chapters, video_id)

    return {output_processing: str(output_html),
            gv_output: output_html}


# (Old lines 73-113 held a second, duplicated definition of gradio_process_video,
#  removed as well; it is only partially visible in the rendered diff.)

# %%
# … (old lines 115-122: css definition, unchanged, not shown in the diff) …
}
"""

example_video_id = "ErnWZxJovaM"
example_chapters = utils.load_json_chapters(example_video_id)
example_output_html = utils.get_result_as_html(example_chapters, example_video_id)

with (gr.Blocks(css=css) as app):
    gr.HTML("<div align='center'><h1   # (title line truncated in the rendered diff)
    gr.HTML("<div align='center'><     # (subtitle line truncated in the rendered diff)
    gr.HTML("<hr>")
    gr.Markdown("""This demo relies on
    - Groq's Llama 3 8B for transcript preprocessing
    - OpenAI's GPT-4o-mini for chaptering. Note: Using GPT-4o-mini for transcript preprocessing will improve results, but takes longer (around 2/3 minutes for a one-hour video)
    # (remainder of the Markdown block truncated in the rendered diff)
    )

    video_id_input = gr.Textbox(label="Enter YouTube Video ID", value="EuC1GWhQdKE")

    with gr.Accordion("Set parameters", open=False):
        with gr.Row():
            with gr.Column(scale=1):
                model_format_transcript = gr.Dropdown(
                    [("LLama 3 8B (Groq)", "llama3-8b"), ("GPT-4o-mini (OpenAI)", "gpt-4o-mini")],
                    label="Transcript preprocessing", value="llama3-8b", interactive=True)
                chunk_size_format_transcript = gr.Textbox(label="Preprocessing chunk size", value=2000)
            with gr.Column(scale=1):
                model_toc = gr.Dropdown([("LLama 3 8B (Groq)", "llama3-8b"), ("GPT-4o-mini (OpenAI)", "gpt-4o-mini")],
                                        label="Chaptering", value="gpt-4o-mini", interactive=True)
                chunk_size_toc = gr.Textbox(label="Chaptering chunk size", value=30)
            with gr.Column(scale=1):
                api_key_openai = gr.Textbox(label="OpenAI API Key", value="xxx")
                api_key_groq = gr.Textbox(label="Groq API Key", value="xxx")

    processing_button = gr.Button("Process transcript")

    gv_output = gr.State()

    gr.HTML("<hr>")

    output_processing = gr.HTML(label="Output processing", value=example_output_html)

    # (the click() call head is truncated in the rendered diff; presumably processing_button.click(…)
    inputs=[video_id_input,
            # (truncated in the rendered diff)
            chunk_size_format_transcript, chunk_size_toc],
    outputs=[output_processing, gv_output])

# gr.HTML(result_as_html)

Two defects in this removed version are worth noting: OpenAI was used in get_llm_client_and_model without ever being imported, and gradio_process_video was defined twice, so the second definition silently shadowed the first. Both disappear with this commit.
Added (new lines 1-31 and 40-71): the Space now only loads precomputed examples from the examples/ directory; no API keys or live LLM calls remain.

import gradio as gr
import os

import utils

example_video_id = "ErnWZxJovaM"

example_output_transcript = utils.load_transcript(example_video_id)
example_chapters = utils.load_json_chapters(example_video_id)
example_output_html = utils.get_result_as_html(example_chapters, example_video_id)

example_video_id_dict = {"MIT Introduction to Deep Learning | 6.S191 - Alexander Amini": "ErnWZxJovaM",
                         "dog": "b8HO6hba9ZE",
                         "bird": "EuC1GWhQdKE"}

example_video_names = list(example_video_id_dict.keys())

def gradio_load_example(example_video):

    video_id = example_video_id_dict[example_video]

    transcript_as_text = utils.load_transcript(video_id)

    chapters = utils.load_json_chapters(video_id)
    output_html = utils.get_result_as_html(chapters, video_id)

    return {output_processing: output_html,
            output_transcript: transcript_as_text}


# %%
# … (new lines 32-39: css definition, unchanged, not shown in the diff) …
}
"""

with (gr.Blocks(css=css) as app):
    gr.HTML("<div align='center'><h1>Demo: Automatic video chaptering with LLMs and TF-IDF</h1></div>")
    gr.HTML("<div align='center'><h2>From raw transcript to structured document</h2></div>")
    gr.HTML("<div align='center'><h3>See the companion <a href=''>Medium article</a> and <a href=''>Github repository</a> for more details</h3>")
    gr.HTML("<hr>")

    #gv_transcript = gr.State()

    video_id_input = gr.Dropdown(choices=example_video_names,
                                 label="Choose a video to see the structured transcript",
                                 value=example_video_names[0])

    load_button = gr.Button("Load example")

    #gv_output = gr.State()

    gr.HTML("<hr>")

    with gr.Accordion("See raw transcript", open=False):
        output_transcript = gr.Textbox(value=example_output_transcript, max_lines=10, lines=10, label="Raw transcript")

    output_processing = gr.HTML(label="Output processing", value=example_output_html)

    load_button.click(gradio_load_example,
                      inputs=[video_id_input],
                      outputs=[output_processing, output_transcript])

# gr.HTML(result_as_html)
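A detail of the new wiring that is easy to miss: gradio_load_example returns a dict keyed by output components instead of a positional tuple. A minimal, self-contained sketch of this Gradio Blocks pattern (the component and function names here are illustrative, not taken from the Space):

import gradio as gr

with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    greeting = gr.Textbox(label="Greeting")
    length = gr.Number(label="Length")

    def greet(n):
        # With outputs declared as a list of components, the handler may
        # return a dict mapping each output component to its new value.
        text = f"Hello, {n}!"
        return {greeting: text, length: len(text)}

    gr.Button("Greet").click(greet, inputs=[name], outputs=[greeting, length])

if __name__ == "__main__":
    demo.launch()

This is why gradio_load_example can name its targets (output_processing, output_transcript) rather than relying on return order.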
ErnWZxJovaM.json → examples/ErnWZxJovaM.json (RENAMED)
File without changes.

examples/ErnWZxJovaM_transcript.json (ADDED)
The diff for this file is too large to render. See raw diff.
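The added transcript file is too large to render, but its shape can be inferred from how utils.load_transcript consumes it: a JSON list of segments from which only the 'start' and 'text' keys are read. A hypothetical two-segment excerpt, written as a Python literal for illustration only (the values are invented; the 'duration' key is an assumption based on the youtube_transcript_api output format):

transcript = [
    {"text": "good afternoon everyone", "start": 0.0, "duration": 4.2},   # invented values
    {"text": "and welcome to the course", "start": 4.2, "duration": 3.8},
]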
utils.py (CHANGED)

Removed (hunk @@ -1,282 +1,13 @@): the whole LLM pipeline, comprising the pricing table, the LLM call wrapper, transcript-to-paragraphs, TF-IDF timestamp inference, and table-of-contents generation.

import json
import re

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


###########################

price_token={'gpt-4o': {'input': 5/1000000, 'output': 15/1000000},
             'gpt-4o-2024-08-06': {'input': 2.5/1000000, 'output': 10/1000000},
             'gpt-4o-mini-2024-07-18': {'input': 0.15/1000000, 'output': 0.6/1000000},
             'llama3-8b-8192' : {'input': 0.05 / 1000000, 'output': 0.08 / 1000000},
             'llama3-70b-8192' : {'input': 0.59 / 1000000, 'output': 0.79 / 1000000},
             'claude-3-5-sonnet-20240620': {'input': 3/1000000, 'output': 15/1000000},
             'claude-3-haiku-20240307': {'input': 0.25/1000000, 'output': 1.25/1000000},
             }

def call_llm(client, model, system_prompt, prompt,
             temperature=0, seed=42, response_format=None, max_tokens=5000):

    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        model=model,
        temperature=temperature,
        seed=seed,
        response_format=response_format,
        max_tokens=max_tokens
    )

    nb_input_tokens = response.usage.prompt_tokens
    nb_output_tokens = response.usage.completion_tokens
    price = nb_input_tokens * price_token[model]['input'] + nb_output_tokens * price_token[model]['output']

    print(f"input tokens: {nb_input_tokens}; output tokens: {nb_output_tokens}, price: {price}")

    response_content=response.choices[0].message.content

    return response_content, nb_input_tokens, nb_output_tokens, price

########################### Step 2: Transcript to paragraph ###########################

system_prompt_transcript_to_paragraphs = f"""

You are a helpful assistant.

Your task is to improve the user input's readability: add punctuation if needed, remove verbal tics, correct grammatical errors, and add appropriate line breaks with '\n\n'.

Put your answer within <answer></answer> tags.

"""


def transcript_to_paragraphs(transcript, llm_client, llm_model, chunk_size=5000, progress=None):

    transcript_as_text = ' '.join([s['text'] for s in transcript])

    paragraphs = []
    last_paragraph = ""

    total_nb_input_tokens, total_nb_output_tokens, total_price = 0, 0, 0

    nb_chunks = int(len(transcript_as_text) / chunk_size) + 1
    progress_i = 0
    print(f"Number of chunks: {nb_chunks}")

    # for i in range(0, 10000, chunk_size):
    for i in range(0, len(transcript_as_text), chunk_size):

        print("i is: " + str(i))

        chunk = last_paragraph + " " + transcript_as_text[i:i + chunk_size]

        if progress is not None:
            progress_i += 1
            progress(progress_i / nb_chunks, desc="Processing")

        found_edited_transcript = False

        while not found_edited_transcript:

            response_content, nb_input_tokens, nb_output_tokens, price = \
                call_llm(llm_client, llm_model,
                         system_prompt=system_prompt_transcript_to_paragraphs, prompt=chunk,
                         temperature=0.2, seed=42, response_format=None)

            if not "</answer>" in response_content:
                response_content += "</answer>"

            # Extract content from <answer> tags
            pattern = re.compile(r'<answer>(.*?)</answer>', re.DOTALL)
            response_content_edited = pattern.findall(response_content)

            if len(response_content_edited) > 0:
                found_edited_transcript = True
                response_content_edited = response_content_edited[0]

            else:
                print("No edited transcript found. Trying again.")
                print(response_content[0:100])
                print(response_content[-100:])

            total_nb_input_tokens += nb_input_tokens
            total_nb_output_tokens += nb_output_tokens
            total_price += price

        paragraphs_chunk = response_content_edited.strip().split('\n\n')

        print('Found paragraphs:', len(paragraphs_chunk))
        last_paragraph = paragraphs_chunk[-1]

        paragraphs += paragraphs_chunk[:-1]

    paragraphs += [last_paragraph]

    paragraphs_dict = [{'paragraph_number': i, 'paragraph_text': paragraph} for i, paragraph in enumerate(paragraphs)]

    return paragraphs_dict, total_nb_input_tokens, total_nb_output_tokens, total_price

########################### Step 3: Infer timestamps ###########################

def transform_text_segments(text_segments, num_words=50):
    # Initialize variables
    transformed_segments = []
    current_index = 0
    num_segments = len(text_segments)

    for i in range(num_segments):

        current_index = i

        # Get the current segment's starting timestamp and text
        current_segment = text_segments[current_index]
        current_text = current_segment['text']

        # Initialize a list to hold the combined text
        combined_text = " ".join(current_text.split()[:num_words])
        number_words_collected = len(current_text.split())

        # Collect words from subsequent segments
        while number_words_collected < num_words and (current_index + 1) < num_segments:
            current_index += 1
            next_segment = text_segments[current_index]
            next_text = next_segment['text']
            next_words = next_text.split()

            # Append words from the next segment
            if number_words_collected + len(next_words) <= num_words:
                combined_text += ' ' + next_text
                number_words_collected += len(next_words)
            else:
                # Only append enough words to reach the num_words limit
                words_needed = num_words - number_words_collected
                combined_text += ' ' + ' '.join(next_words[:words_needed])
                number_words_collected = num_words

        # Append the combined segment to the result
        transformed_segments.append(combined_text)

    return transformed_segments


def add_timestamps_to_paragraphs(transcript, paragraphs, num_words=50):
    list_indices = []

    transcript_num_words = transform_text_segments(transcript, num_words=num_words)

    paragraphs_start_text = [{"start": p['paragraph_number'], "text": p['paragraph_text']} for p in paragraphs]
    paragraphs_num_words = transform_text_segments(paragraphs_start_text, num_words=num_words)

    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer().fit_transform(transcript_num_words + paragraphs_num_words)
    # Get the TF-IDF vectors for the transcript and the excerpt
    vectors = vectorizer.toarray()

    for i in range(len(paragraphs_num_words)):

        # Extract the TF-IDF vector for the paragraph
        paragraph_vector = vectors[len(transcript_num_words) + i]

        # Calculate the cosine similarity between the paragraph vector and each transcript chunk
        similarities = cosine_similarity(vectors[:len(transcript_num_words)], paragraph_vector.reshape(1, -1))
        # Find the index of the most similar chunk
        best_match_index = int(np.argmax(similarities))

        list_indices.append(best_match_index)

        paragraphs[i]['matched_index'] = best_match_index
        paragraphs[i]['matched_text'] = transcript[best_match_index]['text']
        paragraphs[i]['start_time'] = int(transcript[best_match_index]['start']) - 2
        if paragraphs[i]['start_time'] < 0:
            paragraphs[i]['start_time'] = 0

    return paragraphs

########################### Step 4: Generate table of content ###########################


system_prompt_paragraphs_to_toc = """

You are a helpful assistant.

You are given a transcript of a course in JSON format as a list of paragraphs, each containing 'paragraph_number' and 'paragraph_text' keys.

Your task is to group consecutive paragraphs in chapters for the course and identify meaningful chapter titles.

Here are the steps to follow:

1. Read the transcript carefully to understand its general structure and the main topics covered.
2. Look for clues that a new chapter is about to start. This could be a change of topic, a change of time or setting, the introduction of new themes or topics, or the speaker's explicit mention of a new part.
3. For each chapter, keep track of the paragraph number that starts the chapter and identify a meaningful chapter title.
4. Chapters should ideally be equally spaced throughout the transcript, and discuss a specific topic.
5. A chapter MUST have more than 4 paragraphs.

Format your result in JSON, with a list dictionaries for chapters, with 'start_paragraph_number':integer and 'title':string as key:value.

Example:
{"chapters":
    [{"start_paragraph_number": 0, "title": "Introduction"},
     {"start_paragraph_number": 10, "title": "Chapter 1"}
    ]
}

"""


def paragraphs_to_toc(paragraphs, llm_client, llm_model, chunk_size=100):
    chapters = []
    number_last_chapter = 0

    total_nb_input_tokens, total_nb_output_tokens, total_price = 0, 0, 0

    while number_last_chapter < len(paragraphs):

        print(number_last_chapter)

        chunk = paragraphs[number_last_chapter:(number_last_chapter + chunk_size)]
        chunk = [{'paragraph_number': p['paragraph_number'], 'paragraph_text': p['paragraph_text']} for p in chunk]

        chunk_json_dump = json.dumps(chunk)

        content, nb_input_tokens, nb_output_tokens, price = call_llm( \
            llm_client, llm_model, \
            system_prompt_paragraphs_to_toc, chunk_json_dump, \
            temperature=0, seed=42, response_format={"type": "json_object"})

        total_nb_input_tokens += nb_input_tokens
        total_nb_output_tokens += nb_output_tokens

        chapters_chunk = json.loads(content)['chapters']

        if number_last_chapter == chapters_chunk[-1]['start_paragraph_number']:
            break

        chapters += chapters_chunk[:-1]

        number_last_chapter = chapters_chunk[-1]['start_paragraph_number']
        if number_last_chapter >= len(paragraphs) - 5:
            break

    total_price = (total_nb_input_tokens * price_token[llm_model]['input'] +
                   total_nb_output_tokens * price_token[llm_model]['output'])

    chapters += [chapters_chunk[-1]]

    return chapters, total_nb_input_tokens, total_nb_output_tokens, total_price


########################### Step 5: Chapter rendering functions ###########################

def get_chapters(paragraphs, table_of_content):
    # … (kept as unchanged context) …

And further down, in hunk @@ -417,8 +148,22 @@ (after get_result_as_html), the chapters file used to be loaded from the repository root:

        return result_as_html

def load_json_chapters(video_id):
    file_name = f"{video_id}.json"
    with open(file_name, 'r') as file:
        chapters = json.load(file)
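For context on the removed pricing logic: price_token stores per-token dollar rates, so call_llm's cost line is just token counts times rates. A quick worked example using the gpt-4o-mini rates from the table above (the token counts are invented for illustration):

# Worked example of the removed pricing arithmetic (illustrative token counts).
price_token = {'gpt-4o-mini-2024-07-18': {'input': 0.15 / 1000000, 'output': 0.6 / 1000000}}

nb_input_tokens, nb_output_tokens = 10000, 2000
rates = price_token['gpt-4o-mini-2024-07-18']
price = nb_input_tokens * rates['input'] + nb_output_tokens * rates['output']
print(f"price: ${price:.4f}")  # 0.0015 + 0.0012 = $0.0027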
Kept and added (new version, 13 lines at the top plus hunk @@ -417,8 +148,22 @@): only the chapter rendering helpers survive, along with two new loaders for the precomputed examples.

import json
import re


import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


########################### Chapter rendering functions ###########################

def get_chapters(paragraphs, table_of_content):
    # … (new lines 13-147: get_chapters and get_result_as_html bodies, unchanged, not shown in the diff) …

        return result_as_html

def get_transcript_as_text(transcript):
    temp_list = [convert_seconds_to_hms(int(s['start']))+' '+s['text'] for s in transcript]
    transcript_as_text = '\n'.join(temp_list)

    return transcript_as_text

def load_transcript(video_id):
    file_name = f"examples/{video_id}_transcript.json"
    with open(file_name, 'r') as file:
        transcript = json.load(file)

    transcript_as_text = get_transcript_as_text(transcript)
    return transcript_as_text

def load_json_chapters(video_id):
    file_name = f"examples/{video_id}.json"    # was f"{video_id}.json" before the rename
    with open(file_name, 'r') as file:
        chapters = json.load(file)
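Note that get_transcript_as_text calls convert_seconds_to_hms, which does not appear anywhere in this diff and presumably lives in the unchanged part of utils.py. A minimal stand-in, assuming the helper formats a second count as H:MM:SS (a sketch, not the Space's actual implementation):

# Hypothetical stand-in for the convert_seconds_to_hms helper not shown in the diff.
def convert_seconds_to_hms(seconds):
    # Split a second count into hours, minutes, and remaining seconds.
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}"

print(convert_seconds_to_hms(3725))  # -> 1:02:05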