import json
import re

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


###########################
# Chapter rendering functions
###########################

def get_chapters(paragraphs, table_of_content):
    """Build one chapter record per table-of-content entry.

    Args:
        paragraphs: Sequence of dicts; the keys read here are
            ``'paragraph_text'`` and ``'start_time'``.
        table_of_content: Sequence of dicts; the keys read here are
            ``'title'`` and ``'start_paragraph_number'`` (used as an index
            into ``paragraphs``).

    Returns:
        A list of dicts with the keys ``num_chapter``, ``title``,
        ``start_paragraph_number``, ``end_paragraph_number`` (exclusive),
        ``start_time``, ``end_time``, ``paragraphs`` and
        ``paragraph_timestamps``.

    Raises:
        IndexError: If a paragraph number points outside ``paragraphs``, or
            if ``paragraphs`` is empty while ``table_of_content`` is not.
    """
    chapters = []
    last_index = len(table_of_content) - 1
    for i, entry in enumerate(table_of_content):
        start = entry['start_paragraph_number']
        if i < last_index:
            # A chapter ends where the next one starts.
            end = table_of_content[i + 1]['start_paragraph_number']
            end_time = paragraphs[end]['start_time']
        else:
            # The last chapter runs to the end of the transcript. No end
            # timestamp is available in the data read here, so the start
            # time of the final paragraph is used.
            end = len(paragraphs)
            end_time = paragraphs[-1]['start_time']

        # Walk the paragraph range once and derive both lists from it.
        segment = [paragraphs[j] for j in range(start, end)]
        chapters.append({
            'num_chapter': i,
            'title': entry['title'],
            'start_paragraph_number': start,
            'end_paragraph_number': end,
            'start_time': paragraphs[start]['start_time'],
            'end_time': end_time,
            'paragraphs': [p['paragraph_text'] for p in segment],
            'paragraph_timestamps': [p['start_time'] for p in segment],
        })
    return chapters


def convert_seconds_to_hms(seconds):
    """Format a number of seconds as ``HH:MM:SS``.

    The value is truncated to an integer first, so a float such as ``59.9``
    is rendered as ``00:00:59`` rather than producing float-formatted
    fields. Hours are not capped at two digits.

    Args:
        seconds: Number of seconds (anything accepted by ``int()``).

    Returns:
        The zero-padded ``HH:MM:SS`` string.
    """
    minutes, remaining_seconds = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02}:{minutes:02}:{remaining_seconds:02}"


def toc_to_html(chapters):
    """Render the table of contents: a heading, then one line per chapter.

    Each chapter line has the form ``HH:MM:SS - <number> - <title>`` where
    the number is ``num_chapter + 1``.

    Args:
        chapters: Iterable of chapter dicts; the keys read here are
            ``'num_chapter'``, ``'title'`` and ``'start_time'``.

    Returns:
        The rendered table of contents as a single string.
    """
    # NOTE(review): the templates in this module contain only text and
    # newlines; presumably HTML markup is expected around them - confirm
    # against the intended output before relying on it as HTML.
    parts = ["\n\nVideo chapters\n\n\n"]
    for chapter in chapters:
        start_hms = convert_seconds_to_hms(int(chapter['start_time']))
        parts.append(
            f"{start_hms} - {chapter['num_chapter'] + 1} - {chapter['title']}\n\n"
        )
    return "".join(parts)


def section_to_html(section_json_data):
    """Render one chapter: its numbered title, time span and paragraphs.

    Args:
        section_json_data: Chapter dict; the keys read here are
            ``'paragraphs'``, ``'paragraph_timestamps'``, ``'num_chapter'``,
            ``'title'``, ``'start_time'`` and ``'end_time'``.

    Returns:
        The rendered chapter as a single string.

    Raises:
        IndexError: If ``'paragraph_timestamps'`` has fewer entries than
            ``'paragraphs'``.
    """
    paragraphs = section_json_data['paragraphs']

    # Timestamps are looked up by position rather than zipped, so a list
    # that is too short raises instead of silently dropping paragraphs.
    pieces = []
    for i, paragraph in enumerate(paragraphs):
        stamp = convert_seconds_to_hms(
            int(section_json_data['paragraph_timestamps'][i])
        )
        pieces.append(f"\n\n{stamp}\n\n{paragraph}\n\n")
    formatted_section = "".join(pieces)

    num_section = section_json_data['num_chapter']
    from_to = (
        "From " + convert_seconds_to_hms(int(section_json_data['start_time']))
        + " to " + convert_seconds_to_hms(int(section_json_data['end_time']))
    )
    title_link = f"\n{num_section + 1} - {section_json_data['title']}\n"
    return f"\n\n{title_link}\n\n{from_to}\n\n{formatted_section}\n"


def get_result_as_html(chapters, video_id):
    """Assemble the full page: video embed, table of contents, transcript.

    Args:
        chapters: Sequence of chapter dicts as consumed by ``toc_to_html``
            and ``section_to_html``.
        video_id: Identifier of the video.

    Returns:
        The assembled page as a single string.
    """
    # NOTE(review): video_id is not referenced by the template below;
    # presumably the embed markup that used it is missing - confirm.
    video_embed = " "
    toc = toc_to_html(chapters)
    edited_transcript = "\n\nStructured transcript\n\n" + "".join(
        section_to_html(chapter) for chapter in chapters
    )
    return f"\n\n{video_embed}\n\n{toc}\n\n{edited_transcript}\n"


def get_transcript_as_text(transcript):
    """Render a transcript as one ``HH:MM:SS text`` line per segment.

    Args:
        transcript: Iterable of dicts; the keys read here are ``'start'``
            and ``'text'``.

    Returns:
        The lines joined with newlines (no trailing newline).
    """
    return '\n'.join(
        convert_seconds_to_hms(int(segment['start'])) + ' ' + segment['text']
        for segment in transcript
    )


def load_transcript(video_id):
    """Load ``examples/<video_id>_transcript.json`` and render it as text.

    Args:
        video_id: Identifier used to build the file name.

    Returns:
        The transcript formatted by ``get_transcript_as_text``.
    """
    file_name = f"examples/{video_id}_transcript.json"
    # JSON is read as UTF-8 explicitly instead of the locale's default
    # encoding, which is not UTF-8 on every platform.
    with open(file_name, 'r', encoding="utf-8") as file:
        transcript = json.load(file)
    return get_transcript_as_text(transcript)


def load_json_chapters(video_id):
    """Load the parsed content of ``examples/<video_id>.json``.

    Args:
        video_id: Identifier used to build the file name.

    Returns:
        The deserialized JSON content of the file.
    """
    file_name = f"examples/{video_id}.json"
    # JSON is read as UTF-8 explicitly instead of the locale's default
    # encoding, which is not UTF-8 on every platform.
    with open(file_name, 'r', encoding="utf-8") as file:
        return json.load(file)