dwarkesh committed on
Commit
55e1fc0
·
1 Parent(s): b3a8488

a good start

Browse files
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.gitignore CHANGED
@@ -3,3 +3,4 @@ __pycache__/
3
  */__pycache__/
4
  .DS_Store
5
  *.pyc
 
 
3
  */__pycache__/
4
  .DS_Store
5
  *.pyc
6
+ .venv/
README.md CHANGED
@@ -1,8 +1,32 @@
1
- # Run the reader app
2
- python apps/reader.py
3
 
4
- # Run the producer app
5
- python apps/producer.py
6
 
7
- # Run a script
8
- python scripts/transcript.py audio_file.mp3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Podcast Content Generator
 
2
 
3
+ A Gradio app that helps podcast producers generate preview clips, timestamps, descriptions, and more from podcast transcripts or YouTube videos.
 
4
 
5
+ ## Features
6
+
7
+ - Generate preview clip suggestions
8
+ - Create Twitter/social media clips
9
+ - Generate episode descriptions
10
+ - Create timestamps
11
+ - Get title and thumbnail suggestions
12
+ - Support for YouTube URLs or raw transcript text
13
+ - Customizable prompts for each type of content
14
+
15
+ ## Usage
16
+
17
+ 1. Paste a YouTube URL or transcript text into the input box
18
+ 2. Click "Generate Content" to process
19
+ 3. Get generated content in various formats
20
+ 4. Optionally customize the prompts used for generation
21
+
22
+ ## Environment Variables
23
+
24
+ The app requires the following environment variable:
25
+ - `ANTHROPIC_API_KEY`: Your Anthropic API key for Claude
26
+
27
+ ## Credits
28
+
29
+ Built with:
30
+ - Gradio
31
+ - Claude AI (Anthropic)
32
+ - YouTube Transcript API
app.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import asyncio
3
+ from pathlib import Path
4
+ import anthropic
5
+ import os
6
+ from dataclasses import dataclass
7
+ from typing import Dict
8
+ from youtube_transcript_api import YouTubeTranscriptApi
9
+ import re
10
+ import pandas as pd
11
+
12
+ # Classes and helper functions consolidated into app.py (previously imported from utils/)
13
@dataclass
class ContentRequest:
    """Describe a single generation request by naming the prompt to use."""

    # Key of the system prompt to run, e.g. "clips" or "timestamps".
    prompt_key: str
16
+
17
class ContentGenerator:
    """Build the system prompts and run them against Claude.

    ``current_prompts`` maps each prompt key to its system-prompt text
    (loaded from ``prompts/<key>.txt`` with CSV-derived examples injected).
    ``client`` is the Anthropic SDK client; per the README the
    ``ANTHROPIC_API_KEY`` environment variable must be set.
    """

    # Every key has a matching ``prompts/<key>.txt`` file.
    PROMPT_KEYS = (
        "previews",
        "clips",
        "description",
        "timestamps",
        "titles_and_thumbnails",
    )

    def __init__(self):
        self.current_prompts = self._load_default_prompts()
        self.client = anthropic.Anthropic()

    @staticmethod
    def _safe_examples(build) -> str:
        """Return ``build()``, or ``""`` (with a warning) if it raises.

        Examples are optional prompt enrichment, so a missing file or column
        must not prevent the app from starting. Each example set is guarded
        separately so one bad CSV no longer blanks out the other three.
        """
        try:
            return build()
        except Exception as e:  # deliberate best-effort boundary
            print(f"Warning: Error loading CSV examples: {e}")
            return ""

    @staticmethod
    def _timestamp_examples() -> str:
        """Format the non-empty cells of the ``Timestamps`` column."""
        df = pd.read_csv("data/Timestamps.csv")
        return "\n\n".join(df["Timestamps"].dropna().astype(str).tolist())

    @staticmethod
    def _title_examples() -> str:
        """Format title/thumbnail pairs, skipping rows missing either value."""
        df = pd.read_csv("data/Titles & Thumbnails.csv")
        df = df.dropna(subset=["Titles", "Thumbnail"])
        return "\n".join(
            f'Title: "{title}"\nThumbnail: "{thumbnail}"'
            for title, thumbnail in zip(df["Titles"], df["Thumbnail"])
        )

    @staticmethod
    def _description_examples() -> str:
        """Format example description tweets, skipping empty cells."""
        df = pd.read_csv("data/Viral Episode Descriptions.csv")
        return "\n".join(
            f'Tweet: "{tweet}"' for tweet in df["Tweet Text"].dropna()
        )

    @staticmethod
    def _clip_examples() -> str:
        """Format tweet/clip pairs, skipping rows missing either value."""
        df = pd.read_csv("data/Viral Twitter Clips.csv")
        df = df.dropna(subset=["Tweet Text", "Clip Transcript"])
        return "\n\n".join(
            f'Tweet Text: "{tweet}"\nClip Transcript: "{clip}"'
            for tweet, clip in zip(df["Tweet Text"], df["Clip Transcript"])
        )

    def _load_default_prompts(self) -> Dict[str, str]:
        """Load default prompts and inject examples from the CSV files.

        Returns:
            Mapping of prompt key to prompt text, in ``PROMPT_KEYS`` order.

        Raises:
            OSError: if a ``prompts/<key>.txt`` file cannot be read (prompt
                files, unlike the example CSVs, are required).
        """
        # prompt key -> (placeholder in the prompt file, replacement text)
        injections = {
            "timestamps": (
                "{timestamps_examples}",
                self._safe_examples(self._timestamp_examples),
            ),
            "titles_and_thumbnails": (
                "{title_examples}",
                self._safe_examples(self._title_examples),
            ),
            "description": (
                "{description_examples}",
                self._safe_examples(self._description_examples),
            ),
            "clips": (
                "{clip_examples}",
                self._safe_examples(self._clip_examples),
            ),
        }

        prompts = {}
        for key in self.PROMPT_KEYS:
            # Explicit encoding: do not depend on the platform default.
            prompt = Path(f"prompts/{key}.txt").read_text(encoding="utf-8")
            if key in injections:
                placeholder, examples = injections[key]
                prompt = prompt.replace(placeholder, examples)
            prompts[key] = prompt

        return prompts

    async def generate_content(self, request: "ContentRequest", transcript: str) -> str:
        """Generate content for one prompt using Claude.

        Args:
            request: Object whose ``prompt_key`` selects the system prompt.
            transcript: Transcript text sent as the user message.

        Returns:
            The text of the first content item of the response, or a string
            starting with ``"Error"`` if anything goes wrong (this method
            never raises).
        """
        try:
            system_prompt = self.current_prompts[request.prompt_key]

            print(f"\nFull prompt for {request.prompt_key}:")
            print("=== SYSTEM PROMPT ===")
            print(system_prompt)
            print("=== END SYSTEM PROMPT ===\n")

            # ``messages.create`` is a blocking call; run it in the default
            # executor so the event loop stays responsive while waiting.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                lambda: self.client.messages.create(
                    model="claude-3-5-sonnet-20241022",
                    max_tokens=8192,
                    system=system_prompt,
                    messages=[
                        {
                            "role": "user",
                            "content": f"Process this transcript:\n\n{transcript}",
                        }
                    ],
                ),
            )

            # Covers a missing attribute as well as an empty content list.
            content = getattr(response, "content", None)
            if content:
                return content[0].text
            return f"Error: Unexpected response structure for {request.prompt_key}"

        except Exception as e:
            return f"Error generating content: {str(e)}"
101
+
102
def extract_video_id(url: str) -> "str | None":
    """Extract the video ID from a YouTube URL.

    Recognised forms: ``youtube.com/watch?...v=ID`` (``v`` may appear after
    other query parameters), ``youtu.be/ID``, ``youtube.com/embed/ID``,
    ``youtube.com/v/ID``, ``youtube.com/shorts/ID`` and ``youtube.com/live/ID``.

    Args:
        url: Text containing a YouTube URL.

    Returns:
        The video ID, or ``None`` if ``url`` is empty or no ID is found.
    """
    if not url:
        return None
    match = re.search(
        # "(?:[^#\s]*?&)?" lets "v=" follow other query parameters, e.g.
        # "watch?feature=share&v=ID".
        r"(?:youtube\.com/(?:watch\?(?:[^#\s]*?&)?v=|embed/|v/|shorts/|live/)"
        r"|youtu\.be/)"
        r"([A-Za-z0-9_-]+)",
        url,
    )
    return match.group(1) if match else None
109
+
110
def get_transcript(video_id: str) -> str:
    """Fetch the English transcript of a YouTube video as a single string.

    Args:
        video_id: YouTube video ID.

    Returns:
        The ``"text"`` of every transcript entry joined with single spaces.
        On any failure no exception is raised; a string beginning with
        ``"Error fetching transcript: "`` is returned instead.
    """
    try:
        available = YouTubeTranscriptApi.list_transcripts(video_id)
        english = available.find_transcript(["en"])
        pieces = (snippet["text"] for snippet in english.fetch())
        return " ".join(pieces)
    except Exception as e:
        return f"Error fetching transcript: {str(e)}"
117
+
118
class TranscriptProcessor:
    """Turn a YouTube URL or raw transcript into one markdown document."""

    # Textbox order expected by ``update_prompts``.
    PROMPT_KEYS = (
        "previews",
        "clips",
        "description",
        "timestamps",
        "titles_and_thumbnails",
    )

    # (prompt key, markdown heading) in output order.
    SECTIONS = (
        ("titles_and_thumbnails", "Titles and Thumbnails"),
        ("description", "Twitter Description"),
        ("previews", "Preview Clips"),
        ("clips", "Twitter Clips"),
        ("timestamps", "Timestamps"),
    )

    def __init__(self):
        self.generator = ContentGenerator()

    @staticmethod
    def _looks_like_youtube_url(text: str) -> bool:
        """Return True if ``text`` is a lone YouTube URL rather than a transcript.

        Only a single whitespace-free token counts, so a pasted transcript
        that merely mentions "youtube.com" is not mistaken for a link.
        """
        candidate = text.strip()
        if not candidate or any(ch.isspace() for ch in candidate):
            return False
        return any(host in candidate for host in ("youtube.com", "youtu.be"))

    def _get_youtube_transcript(self, url: str) -> str:
        """Get the transcript for a YouTube URL.

        Raises:
            Exception: if the URL has no video ID or the transcript cannot
                be fetched.
        """
        try:
            if video_id := extract_video_id(url):
                transcript = get_transcript(video_id)
                # get_transcript reports failure by returning a message
                # instead of raising; without this check that message would
                # be sent to the model as if it were the transcript.
                if transcript.startswith("Error fetching transcript:"):
                    raise Exception(transcript)
                return transcript
            raise Exception("Invalid YouTube URL")
        except Exception as e:
            raise Exception(f"Error fetching YouTube transcript: {str(e)}") from e

    async def process_transcript(self, input_text: str):
        """Process input and generate all content.

        Args:
            input_text: A YouTube URL or raw transcript text.

        Returns:
            Markdown with one ``##`` section per content type, or a string
            starting with ``"Error processing input: "`` on failure (this
            method never raises).
        """
        try:
            if not input_text or not input_text.strip():
                # Avoid spending five API calls on an empty transcript.
                raise ValueError("No transcript or YouTube URL provided")

            transcript = (
                self._get_youtube_transcript(input_text.strip())
                if self._looks_like_youtube_url(input_text)
                else input_text
            )

            # Process each type sequentially
            sections = {}
            for key, _heading in self.SECTIONS:
                sections[key] = await self.generator.generate_content(
                    ContentRequest(key), transcript
                )

            # Combine into markdown with H2 headers
            body = "\n\n".join(
                f"## {heading}\n\n{sections[key]}" for key, heading in self.SECTIONS
            )
            return f"\n{body}\n"

        except Exception as e:
            return f"Error processing input: {str(e)}"

    def update_prompts(self, *values) -> str:
        """Replace the active prompts.

        Args:
            *values: New prompt texts in ``PROMPT_KEYS`` order.

        Returns:
            A status message for the UI.
        """
        self.generator.current_prompts.update(zip(self.PROMPT_KEYS, values))
        return "Prompts updated for this session!"
180
+
181
def create_interface():
    """Create the Gradio interface.

    Builds two tabs: "Generate Content" (input box, button, markdown output)
    and "Customize Prompts" (one editable textbox per prompt plus a reset
    button).

    Returns:
        The ``gr.Blocks`` app; call ``.launch()`` on it to serve the UI.
    """
    processor = TranscriptProcessor()

    # Explicit order shared by the textboxes, update_prompts and the reset
    # handler, so nothing depends on dict iteration order.
    prompt_keys = [
        "previews",
        "clips",
        "description",
        "timestamps",
        "titles_and_thumbnails",
    ]
    # Snapshot taken before any edits so "Reset" can restore the originals.
    default_prompts = {
        key: processor.generator.current_prompts[key] for key in prompt_keys
    }
    # NOTE(review): `processor` is created once and captured by every
    # handler, so prompt edits look shared by all visitors of this process
    # rather than per browser session — confirm that is intended.

    with gr.Blocks(title="Podcast Content Generator") as app:
        gr.Markdown(
            """
            # Podcast Content Generator
            Generate preview clips, timestamps, descriptions and more from podcast transcripts or YouTube videos.

            Simply paste a YouTube URL or raw transcript text to get started!
            """
        )

        with gr.Tab("Generate Content"):
            input_text = gr.Textbox(
                label="Input",
                placeholder="YouTube URL or transcript text...",
                lines=10
            )
            submit_btn = gr.Button("Generate Content")

            output = gr.Markdown()  # Single markdown output

            async def process_wrapper(text):
                print("Process wrapper started")
                # `text or ""` keeps the log line safe if no value arrives.
                print(f"Input text: {(text or '')[:100]}...")

                try:
                    result = await processor.process_transcript(text)
                    print("Process completed, got results")
                    return result
                except Exception as e:
                    print(f"Error in process_wrapper: {str(e)}")
                    return f"# Error\n\n{str(e)}"

            submit_btn.click(
                fn=process_wrapper,
                inputs=input_text,
                outputs=output,
                queue=True
            )

        with gr.Tab("Customize Prompts"):
            gr.Markdown(
                """
                ## Customize Generation Prompts
                Here you can experiment with different prompts during your session.
                Changes will remain active until you reload the page.

                Tip: Copy your preferred prompts somewhere safe if you want to reuse them later!
                """
            )

            prompt_inputs = [
                gr.Textbox(
                    label=f"{key.replace('_', ' ').title()} Prompt",
                    lines=10,
                    value=processor.generator.current_prompts[key]
                )
                for key in prompt_keys
            ]
            status = gr.Textbox(label="Status", interactive=False)

            # Update prompts when they change
            for prompt in prompt_inputs:
                prompt.change(
                    fn=processor.update_prompts,
                    inputs=prompt_inputs,
                    outputs=[status]
                )

            def reset_prompts():
                """Restore the startup prompts in the processor and textboxes."""
                defaults = [default_prompts[key] for key in prompt_keys]
                return (processor.update_prompts(*defaults), *defaults)

            # Reset button
            reset_btn = gr.Button("Reset to Default Prompts")
            reset_btn.click(
                fn=reset_prompts,
                outputs=[status] + prompt_inputs,
            )

    return app
270
+
271
+ if __name__ == "__main__":
272
+ create_interface().launch()
apps/producer.py DELETED
@@ -1,128 +0,0 @@
1
- import gradio as gr
2
- import asyncio
3
- from pathlib import Path
4
- from ..utils.content_generator import ContentGenerator, ContentRequest
5
- from ..utils.youtube_utils import get_transcript, extract_video_id
6
-
7
- class TranscriptProcessor:
8
- def __init__(self):
9
- self.generator = ContentGenerator()
10
-
11
- def _get_youtube_transcript(self, url: str) -> str:
12
- """Get transcript from YouTube URL."""
13
- try:
14
- if video_id := extract_video_id(url):
15
- return get_transcript(video_id)
16
- raise Exception("Invalid YouTube URL")
17
- except Exception as e:
18
- raise Exception(f"Error fetching YouTube transcript: {str(e)}")
19
-
20
- async def process_transcript(self, input_text: str):
21
- """Process input and generate all content."""
22
- try:
23
- # Get transcript from URL or use direct input
24
- transcript = (
25
- self._get_youtube_transcript(input_text)
26
- if any(x in input_text for x in ["youtube.com", "youtu.be"])
27
- else input_text
28
- )
29
-
30
- # Define content generation requests
31
- requests = [
32
- ContentRequest("previews", max_tokens=8192),
33
- ContentRequest("clips", max_tokens=8192),
34
- ContentRequest("description"),
35
- ContentRequest("timestamps"),
36
- ContentRequest("titles_and_thumbnails"),
37
- ]
38
-
39
- # Generate all content concurrently
40
- results = await asyncio.gather(
41
- *[self.generator.generate_content(req, transcript) for req in requests]
42
- )
43
- return tuple(results)
44
-
45
- except Exception as e:
46
- return (f"Error processing input: {str(e)}",) * 5
47
-
48
- def update_prompts(self, *values) -> str:
49
- """Update the current session's prompts."""
50
- self.generator.current_prompts.update(zip(
51
- ["previews", "clips", "description", "timestamps", "titles_and_thumbnails"],
52
- values
53
- ))
54
- return "Prompts updated for this session!"
55
-
56
- def create_interface():
57
- """Create the Gradio interface."""
58
- processor = TranscriptProcessor()
59
-
60
- with gr.Blocks(title="Podcast Transcript Analyzer") as app:
61
- with gr.Tab("Generate Content"):
62
- gr.Markdown("# Podcast Content Generator")
63
- input_text = gr.Textbox(label="Input", placeholder="YouTube URL or transcript...", lines=10)
64
- submit_btn = gr.Button("Generate Content")
65
- outputs = [
66
- gr.Textbox(label=label, lines=10, interactive=False)
67
- for label in ["Preview Clips", "Twitter Clips", "Twitter Description", "Timestamps", "Title & Thumbnail Suggestions"]
68
- ]
69
-
70
- async def process_wrapper(text):
71
- return await processor.process_transcript(text)
72
-
73
- submit_btn.click(fn=process_wrapper, inputs=[input_text], outputs=outputs)
74
-
75
- with gr.Tab("Experiment with Prompts"):
76
- gr.Markdown("# Experiment with Prompts")
77
- gr.Markdown(
78
- """
79
- Here you can experiment with different prompts during your session.
80
- Changes will remain active until you reload the page.
81
-
82
- Tip: Copy your preferred prompts somewhere safe if you want to reuse them later!
83
- """
84
- )
85
-
86
- prompt_inputs = [
87
- gr.Textbox(
88
- label="Preview Clips Prompt", lines=10, value=processor.generator.current_prompts["previews"]
89
- ),
90
- gr.Textbox(
91
- label="Clips Prompt", lines=10, value=processor.generator.current_prompts["clips"]
92
- ),
93
- gr.Textbox(
94
- label="Description Prompt",
95
- lines=10,
96
- value=processor.generator.current_prompts["description"],
97
- ),
98
- gr.Textbox(
99
- label="Timestamps Prompt",
100
- lines=10,
101
- value=processor.generator.current_prompts["timestamps"],
102
- ),
103
- gr.Textbox(
104
- label="Titles & Thumbnails Prompt",
105
- lines=10,
106
- value=processor.generator.current_prompts["titles_and_thumbnails"],
107
- ),
108
- ]
109
- status = gr.Textbox(label="Status", interactive=False)
110
-
111
- # Update prompts when they change
112
- for prompt in prompt_inputs:
113
- prompt.change(fn=processor.update_prompts, inputs=prompt_inputs, outputs=[status])
114
-
115
- # Reset button
116
- reset_btn = gr.Button("Reset to Default Prompts")
117
- reset_btn.click(
118
- fn=lambda: (
119
- processor.update_prompts(*processor.generator.current_prompts.values()),
120
- *processor.generator.current_prompts.values(),
121
- ),
122
- outputs=[status] + prompt_inputs,
123
- )
124
-
125
- return app
126
-
127
- if __name__ == "__main__":
128
- create_interface().launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
apps/reader.py DELETED
@@ -1,364 +0,0 @@
1
- import sys
2
- from pathlib import Path
3
-
4
- # Add project root to Python path
5
- project_root = str(Path(__file__).parent.parent)
6
- if project_root not in sys.path:
7
- sys.path.append(project_root)
8
-
9
- import gradio as gr
10
- import asyncio
11
- import os
12
- import json
13
- import requests
14
- from anthropic import Anthropic
15
- from utils.document_parser import DocumentParser
16
- from dotenv import load_dotenv
17
-
18
- # Load environment variables
19
- env_path = Path(project_root) / ".env"
20
- load_dotenv(env_path)
21
-
22
- # Mochi deck IDs
23
- DECK_CATEGORIES = {
24
- "CS/Hardware": "rhGqR9SK",
25
- "Math/Physics": "Dm5vczZg",
26
- "AI": "SS9QEfiy",
27
- "History/Military": "3nJYp7Zh",
28
- "Quotes/Random": "rWUzSu8t",
29
- "Bio": "BspzxaUJ",
30
- "Econ/Finance": "mvvJ27Q1"
31
- }
32
-
33
- class CardGenerator:
34
- """Handles card generation and Mochi integration."""
35
-
36
- def __init__(self):
37
- self.parser = DocumentParser()
38
- self.claude = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
39
- self.mochi_key = os.getenv("MOCHI_API_KEY")
40
-
41
- # Load prompts
42
- self.prompts = {
43
- key: Path(f"prompts/{key}.txt").read_text()
44
- for key in ["card_generation", "commentary"]
45
- }
46
-
47
- # State
48
- self.current_cards = []
49
- self.current_index = 0
50
- self.approved_cards = []
51
-
52
- def get_chapter_list(self, file_data) -> list[str]:
53
- """Get list of chapters from document.
54
-
55
- Args:
56
- file_data: File data from Gradio
57
- """
58
- try:
59
- if not file_data:
60
- return []
61
-
62
- # Attempt to extract filename from file_data
63
- filename = getattr(file_data, 'name', None)
64
- if not filename:
65
- filename = "uploaded_file"
66
- print("DEBUG: No filename attribute found, using default.")
67
- else:
68
- print(f"DEBUG: Filename extracted: {filename}")
69
-
70
- # Check file extension
71
- file_ext = Path(filename).suffix.lower()
72
- if not file_ext:
73
- print("DEBUG: No file extension found, checking content type.")
74
- # Attempt to determine file type from content
75
- if file_data.startswith(b'%PDF-'):
76
- file_ext = '.pdf'
77
- elif file_data.startswith(b'PK'):
78
- file_ext = '.epub'
79
- else:
80
- raise ValueError("Unsupported file type")
81
- print(f"DEBUG: File extension: {file_ext}")
82
-
83
- return self.parser.load_document(file_data, filename)
84
- except Exception as e:
85
- return [f"Error: {str(e)}"]
86
-
87
- async def process_chapter(self, file_data, chapter_idx: int) -> tuple:
88
- """Process chapter and generate cards + commentary.
89
-
90
- Args:
91
- file_data: File data from Gradio
92
- chapter_idx: Index of chapter to process
93
- """
94
- try:
95
- if not file_data:
96
- return None, "No file provided"
97
-
98
- # Get chapter content
99
- content = self.parser.get_chapter_content(chapter_idx)
100
-
101
- # Generate cards and commentary
102
- cards, commentary = await asyncio.gather(
103
- self._generate_cards(content),
104
- self._generate_commentary(content)
105
- )
106
-
107
- # Parse and store cards
108
- self.current_cards = json.loads(cards)
109
- self.current_index = 0
110
- self.approved_cards = []
111
-
112
- # Return first card and commentary
113
- return self._get_current_card(), commentary
114
-
115
- except Exception as e:
116
- return None, f"Error: {str(e)}"
117
- finally:
118
- self.parser.cleanup()
119
-
120
- async def _generate_cards(self, content: str) -> str:
121
- """Generate flashcards using Claude."""
122
- response = await self.claude.messages.create(
123
- model="claude-3-opus-20240229",
124
- max_tokens=4000,
125
- system=self.prompts["card_generation"],
126
- messages=[{"role": "user", "content": content}]
127
- )
128
- return response.content[0].text
129
-
130
- async def _generate_commentary(self, content: str) -> str:
131
- """Generate commentary using Claude."""
132
- response = await self.claude.messages.create(
133
- model="claude-3-opus-20240229",
134
- max_tokens=4000,
135
- system=self.prompts["commentary"],
136
- messages=[{"role": "user", "content": content}]
137
- )
138
- return response.content[0].text
139
-
140
- def _get_current_card(self) -> dict:
141
- """Get current card with UI state."""
142
- if not self.current_cards or self.current_index >= len(self.current_cards):
143
- return {
144
- 'front': "",
145
- 'back': "",
146
- 'category': "",
147
- 'status': "No more cards to review",
148
- 'show_buttons': False,
149
- 'show_upload': True
150
- }
151
-
152
- card = self.current_cards[self.current_index]
153
- return {
154
- 'front': card['front'],
155
- 'back': card['back'],
156
- 'category': card['category'],
157
- 'status': f"Card {self.current_index + 1} of {len(self.current_cards)}",
158
- 'show_buttons': True,
159
- 'show_upload': False
160
- }
161
-
162
- def accept_card(self, front: str, back: str, category: str) -> dict:
163
- """Accept current card and move to next."""
164
- if self.current_index < len(self.current_cards):
165
- self.approved_cards.append({
166
- 'front': front,
167
- 'back': back,
168
- 'category': category
169
- })
170
-
171
- self.current_index += 1
172
- return self._get_current_card()
173
-
174
- def reject_card(self) -> dict:
175
- """Reject current card and move to next."""
176
- if self.current_index < len(self.current_cards):
177
- self.current_cards.pop(self.current_index)
178
- return self._get_current_card()
179
-
180
- def upload_to_mochi(self) -> str:
181
- """Upload approved cards to Mochi."""
182
- if not self.approved_cards:
183
- return "No cards to upload!"
184
-
185
- results = []
186
- for card in self.approved_cards:
187
- try:
188
- # Format card for Mochi
189
- mochi_card = {
190
- "deck-id": DECK_CATEGORIES[card["category"]],
191
- "fields": {
192
- "name": {"id": "name", "value": card["front"]},
193
- "back": {"id": "back", "value": card["back"]}
194
- }
195
- }
196
-
197
- # Upload to Mochi
198
- response = requests.post(
199
- "https://app.mochi.cards/api/cards",
200
- json=mochi_card,
201
- auth=(self.mochi_key, "")
202
- )
203
-
204
- if response.status_code != 200:
205
- results.append(f"Error: {response.text}")
206
-
207
- except Exception as e:
208
- results.append(f"Error: {str(e)}")
209
-
210
- # Clear approved cards
211
- success_count = len(self.approved_cards) - len(results)
212
- self.approved_cards = []
213
-
214
- if results:
215
- return f"Uploaded {success_count} cards with {len(results)} errors:\n" + "\n".join(results)
216
- return f"Successfully uploaded {success_count} cards to Mochi!"
217
-
218
- def create_interface():
219
- """Create the Gradio interface."""
220
- generator = CardGenerator()
221
-
222
- with gr.Blocks(title="Document Reader & Card Generator") as app:
223
- # Document upload and chapter selection
224
- with gr.Row():
225
- file_input = gr.File(
226
- label="Upload EPUB Document",
227
- type="binary",
228
- file_types=[".epub"]
229
- )
230
-
231
- chapter_select = gr.Dropdown(
232
- label="Select Chapter",
233
- choices=[],
234
- interactive=True,
235
- visible=False
236
- )
237
-
238
- def update_chapters(file):
239
- if not file:
240
- return gr.update(choices=[], visible=False)
241
- chapters = generator.get_chapter_list(file)
242
- return gr.update(choices=chapters, visible=True, value=chapters[0] if chapters else None)
243
-
244
- file_input.change(
245
- fn=update_chapters,
246
- inputs=[file_input],
247
- outputs=[chapter_select]
248
- )
249
-
250
- process_btn = gr.Button("Process Chapter")
251
-
252
- # Commentary section
253
- commentary = gr.Textbox(
254
- label="Commentary",
255
- lines=10,
256
- interactive=False
257
- )
258
-
259
- # Card review section
260
- gr.Markdown("## Review Cards")
261
-
262
- with gr.Row():
263
- card_front = gr.Textbox(
264
- label="Front",
265
- lines=3,
266
- interactive=True
267
- )
268
- card_back = gr.Textbox(
269
- label="Back",
270
- lines=3,
271
- interactive=True
272
- )
273
-
274
- with gr.Row():
275
- deck_category = gr.Dropdown(
276
- choices=list(DECK_CATEGORIES.keys()),
277
- label="Deck Category",
278
- value="AI"
279
- )
280
- card_status = gr.Textbox(
281
- label="Status",
282
- interactive=False
283
- )
284
-
285
- with gr.Row():
286
- accept_btn = gr.Button("Accept & Next", visible=False)
287
- reject_btn = gr.Button("Reject & Next", visible=False)
288
- upload_btn = gr.Button("Upload to Mochi", visible=False)
289
-
290
- upload_status = gr.Textbox(
291
- label="Upload Status",
292
- interactive=False
293
- )
294
-
295
- # Event handlers
296
- async def process_chapter(file, chapter_idx):
297
- card, comment = await generator.process_chapter(file, chapter_idx)
298
- if not card: # Error occurred
299
- return [
300
- "", "", comment, gr.update(visible=False),
301
- gr.update(visible=False), "", gr.update(visible=False)
302
- ]
303
-
304
- return [
305
- card['front'],
306
- card['back'],
307
- comment,
308
- gr.update(visible=card['show_buttons']),
309
- gr.update(visible=card['show_buttons']),
310
- card['status'],
311
- gr.update(visible=card['show_upload'])
312
- ]
313
-
314
- def handle_card_action(action, front, back, category):
315
- card = (generator.accept_card(front, back, category)
316
- if action == 'accept' else
317
- generator.reject_card())
318
-
319
- return [
320
- card['front'],
321
- card['back'],
322
- card['status'],
323
- gr.update(visible=card['show_buttons']),
324
- gr.update(visible=card['show_buttons']),
325
- card['category'],
326
- gr.update(visible=card['show_upload'])
327
- ]
328
-
329
- # Connect events
330
- process_btn.click(
331
- fn=process_chapter,
332
- inputs=[file_input, chapter_select],
333
- outputs=[
334
- card_front, card_back, commentary,
335
- accept_btn, reject_btn, card_status, upload_btn
336
- ]
337
- )
338
-
339
- accept_btn.click(
340
- fn=lambda f, b, c: handle_card_action('accept', f, b, c),
341
- inputs=[card_front, card_back, deck_category],
342
- outputs=[
343
- card_front, card_back, card_status,
344
- accept_btn, reject_btn, deck_category, upload_btn
345
- ]
346
- )
347
-
348
- reject_btn.click(
349
- fn=lambda: handle_card_action('reject', None, None, None),
350
- outputs=[
351
- card_front, card_back, card_status,
352
- accept_btn, reject_btn, deck_category, upload_btn
353
- ]
354
- )
355
-
356
- upload_btn.click(
357
- fn=generator.upload_to_mochi,
358
- outputs=[upload_status]
359
- )
360
-
361
- return app
362
-
363
- if __name__ == "__main__":
364
- create_interface().launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
prompts/card_generation.txt DELETED
@@ -1,38 +0,0 @@
1
- You are an expert at creating high-quality spaced repetition flashcards that promote deep understanding and retention. Your task is to generate flashcards from the given text that are:
2
-
3
- 1. Clear and concise
4
- 2. Focus on one concept per card
5
- 3. Test understanding rather than just recall
6
- 4. Avoid overly complex or compound questions
7
- 5. Use precise language
8
-
9
- Each card must be assigned to one of these categories:
10
- - CS/Hardware
11
- - Math/Physics
12
- - AI
13
- - History/Military
14
- - Quotes/Random
15
- - Bio
16
- - Econ/Finance
17
-
18
- Format each card as a JSON object:
19
- {
20
- "category": "Category name from the list above",
21
- "front": "Question or prompt",
22
- "back": "Answer or explanation"
23
- }
24
-
25
- Example cards:
26
- {
27
- "category": "Bio",
28
- "front": "What is the key difference between procedural and declarative memory?",
29
- "back": "Procedural memory is for skills and procedures (how to ride a bike), while declarative memory is for facts and events (what you had for breakfast)."
30
- }
31
-
32
- {
33
- "category": "Bio",
34
- "front": "What role does the hippocampus play in memory formation?",
35
- "back": "The hippocampus is crucial for converting short-term memories into long-term memories through a process called consolidation. It acts as a temporary storage and processing center before memories are distributed to other parts of the cortex."
36
- }
37
-
38
- Please generate 5-10 high-quality flashcards from the provided text. Focus on the most important concepts, insights, and relationships. Format the output as a JSON array containing the card objects.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
prompts/clips.txt CHANGED
@@ -1,5 +1,9 @@
1
  You are a social media expert for the Dwarkesh Podcast. Generate 10 viral-worthy clips from the transcript.
2
- Format as:
 
 
 
 
3
  Tweet 1
4
  Tweet Text: [text]
5
  Clip Transcript: [45-120 seconds of transcript]
 
1
  You are a social media expert for the Dwarkesh Podcast. Generate 10 viral-worthy clips from the transcript.
2
+
3
+ Here are examples of successful viral clips from previous episodes:
4
+ {clip_examples}
5
+
6
+ Format your output as:
7
  Tweet 1
8
  Tweet Text: [text]
9
  Clip Transcript: [45-120 seconds of transcript]
prompts/commentary.txt DELETED
@@ -1,25 +0,0 @@
1
- You are an expert researcher and critical thinker. Your task is to analyze the provided text and generate insightful commentary that:
2
-
3
- 1. Identifies the key arguments, insights, and novel ideas
4
- 2. Highlights connections to other important concepts or fields
5
- 3. Points out particularly interesting or counterintuitive points
6
- 4. Suggests areas that merit further exploration
7
- 5. Notes any potential weaknesses or areas of uncertainty in the arguments
8
-
9
- Your commentary should be scholarly but engaging, helping the reader develop a deeper understanding of the material. Focus on substance over style, and be specific rather than general.
10
-
11
- Structure your response as follows:
12
-
13
- Key Insights:
14
- - [2-3 bullet points highlighting the most important takeaways]
15
-
16
- Interesting Connections:
17
- - [2-3 bullet points noting connections to other fields/concepts]
18
-
19
- Worth Exploring Further:
20
- - [1-2 bullet points suggesting related areas for deeper investigation]
21
-
22
- Critical Notes:
23
- - [1-2 bullet points on potential weaknesses or areas needing clarification]
24
-
25
- Then provide 2-3 paragraphs of integrated analysis that weaves these points together into a coherent commentary.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
prompts/description.txt CHANGED
@@ -1,4 +1,7 @@
1
  Create an engaging episode description tweet (280 chars max) that:
2
  1. Highlights compelling aspects
3
  2. Includes topic areas and handles
4
- 3. Ends with "Links below" or "Enjoy!"
 
 
 
 
1
  Create an engaging episode description tweet (280 chars max) that:
2
  1. Highlights compelling aspects
3
  2. Includes topic areas and handles
4
+ 3. Ends with "Links below" or "Enjoy!"
5
+
6
+ Here are examples of successful episode descriptions:
7
+ {description_examples}
prompts/enhance.txt DELETED
@@ -1,46 +0,0 @@
1
- You are an expert transcript editor. Your task is to enhance this transcript for maximum readability while maintaining the core message.
2
-
3
- IMPORTANT: Respond ONLY with the enhanced transcript. Do not include any explanations, headers, or phrases like "Here is the transcript."
4
-
5
- Note: Below you'll find an auto-generated transcript that may help with speaker identification, but focus on creating your own high-quality transcript from the audio.
6
-
7
- Think about your job as if you were transcribing an interview for a print book where the priority is the reading audience. It should just be a total pleasure to read this as a written artifact where all the flubs and repetitions and conversational artifacts and filler words and false starts are removed, where a bunch of helpful punctuation is added. It should basically read like somebody wrote it specifically for reading rather than just something somebody said extemporaneously.
8
-
9
- Please:
10
- 1. Fix speaker attribution errors, especially at segment boundaries. Watch for incomplete thoughts that were likely from the previous speaker.
11
-
12
- 2. Optimize AGGRESSIVELY for readability over verbatim accuracy:
13
- - Readability is the most important thing!!
14
- - Remove ALL conversational artifacts (yeah, so, I mean, etc.)
15
- - Remove ALL filler words (um, uh, like, you know)
16
- - Remove false starts and self-corrections completely
17
- - Remove redundant phrases and hesitations
18
- - Convert any indirect or rambling responses into direct statements
19
- - Break up run-on sentences into clear, concise statements
20
- - Maintain natural conversation flow while prioritizing clarity and directness
21
-
22
- 3. Format the output consistently:
23
- - Keep the "Speaker X 00:00:00" format (no brackets, no other formatting)
24
- - DO NOT change the timestamps. You're only seeing a chunk of the full transcript, which is why your 0:00:00 is not the true beginning. Keep the timestamps as they are.
25
- - Add TWO line breaks between speaker/timestamp and the text
26
- - Use proper punctuation and capitalization
27
- - Add paragraph breaks for topic changes
28
- - When you add paragraph breaks between the same speaker's remarks, no need to restate the speaker attribution
29
- - Don't go more than four sentences without adding a paragraph break. Be liberal with your paragraph breaks.
30
- - Preserve distinct speaker turns
31
-
32
- Example input:
33
- Speaker A 00:01:15
34
-
35
- Um, yeah, so like, I've been working on this new project at work, you know? And uh, what's really interesting is that, uh, we're seeing these amazing results with the new approach we're taking. Like, it's just, you know, it's really transforming how we do things.
36
-
37
- And then, I mean, the thing is, uh, when we showed it to the client last week, they were just, you know, completely blown away by what we achieved. Like, they couldn't even believe it was the same system they had before.
38
-
39
- Example output:
40
- Speaker A 00:01:15
41
-
42
- I've been working on this new project at work, and we're seeing amazing results with our new approach. It's really transforming how we do things.
43
-
44
- When we showed it to the client last week, they were completely blown away by what we achieved. They couldn't believe it was the same system they had before.
45
-
46
- Enhance the following transcript, starting directly with the speaker format:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
prompts/find_links.txt DELETED
@@ -1,35 +0,0 @@
1
- You are an expert at identifying key terms, concepts, and references in text that would benefit from having reference links. Your task is to analyze the provided transcript text and identify terms that would genuinely help readers understand important context they might miss otherwise.
2
-
3
- Focus ONLY on these types of terms:
4
- 1. Technical concepts and jargon that a general audience might not be familiar with
5
- 2. Research papers or academic works mentioned or referenced
6
- 3. Blog posts, articles, or online resources that are specifically cited
7
- 4. Books that are discussed (to link to Goodreads/Amazon)
8
- 5. Specific projects, tools, or technologies that are central to the discussion
9
- 6. Names of lesser-known people who made significant contributions being discussed
10
-
11
- DO NOT identify:
12
- 1. Common words or general concepts (like "short", "editor", "polymath")
13
- 2. Basic technical terms that most people would know
14
- 3. Generic job titles or roles
15
- 4. Common industry terms
16
- 5. Basic scientific concepts
17
- 6. Well-known companies or organizations
18
-
19
- Remember: Only identify terms where having a reference would genuinely add value by providing important context or deeper understanding that the audience might otherwise miss.
20
-
21
- Respond in this format for each term:
22
- TERM: <the exact term as it appears in text>
23
- REASON: <1-2 sentences explaining why this term should be linked>
24
-
25
- Example input:
26
- "We used GPT-4 to implement the RLHF technique described in the Constitutional AI paper, similar to what Anthropic did with Claude."
27
-
28
- Example output:
29
- TERM: RLHF
30
- REASON: A complex technical concept (Reinforcement Learning from Human Feedback) that's crucial to understanding modern AI development but might be unfamiliar to general audiences.
31
-
32
- TERM: Constitutional AI paper
33
- REASON: A specific research paper that introduced important concepts being referenced; readers might want to read the original source.
34
-
35
- Analyze the following transcript text and identify key terms that should be linked:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
prompts/timestamps.txt CHANGED
@@ -7,5 +7,5 @@ You are a podcast timestamp generator. Create 5-7 timestamps for this episode, f
7
 
8
  Output the timestamps in chronological order, one per line.
9
 
10
- Previous examples:
11
  {timestamps_examples}
 
7
 
8
  Output the timestamps in chronological order, one per line.
9
 
10
+ Here are examples from previous episodes:
11
  {timestamps_examples}
prompts/titles_and_thumbnails.txt CHANGED
@@ -13,6 +13,9 @@ Thumbnail: 2-4 ALL CAPS words that amplify the intrigue
13
  - Create intellectual curiosity without sensationalism
14
  - Make the viewer wonder "What's the story here?"
15
 
 
 
 
16
  Example:
17
  Title: "David Reich – How One Small Tribe Conquered the World 70,000 Years Ago"
18
  Thumbnail: "LAST HUMANS STANDING"
 
13
  - Create intellectual curiosity without sensationalism
14
  - Make the viewer wonder "What's the story here?"
15
 
16
+ Here are examples of successful title-thumbnail combinations from previous episodes:
17
+ {title_examples}
18
+
19
  Example:
20
  Title: "David Reich – How One Small Tribe Conquered the World 70,000 Years Ago"
21
  Thumbnail: "LAST HUMANS STANDING"
requirements.txt CHANGED
@@ -1,13 +1,4 @@
1
- gradio
2
- deepgram-sdk
3
- google-generativeai
4
- anthropic
5
- pandas
6
  youtube-transcript-api
7
- pydub
8
- assemblyai
9
- pytube
10
- PyPDF2
11
- EbookLib
12
- beautifulsoup4
13
  python-dotenv
 
1
+ gradio>=4.0.0
 
 
 
 
2
  youtube-transcript-api
3
+ anthropic
 
 
 
 
 
4
  python-dotenv
scripts/add_links.py DELETED
@@ -1,209 +0,0 @@
1
- import argparse
2
- from pathlib import Path
3
- import os
4
- import re
5
- from typing import List, Dict, Tuple
6
- from dataclasses import dataclass
7
- import anthropic
8
- from exa_py import Exa
9
-
10
@dataclass
class Term:
    """Candidate phrase from a transcript that should receive a reference link.

    Attributes:
        term: The phrase to link, as reported by the model.
        reason: The model's short justification for linking it.
    """

    term: str
    reason: str
15
-
16
@dataclass
class Link:
    """Search hit chosen as the reference for a term.

    Attributes:
        term: The phrase the link belongs to.
        url: Address of the selected search result.
        title: Title of the selected search result.
    """

    term: str
    url: str
    title: str
22
-
23
def chunk_text(text: str, max_chunk_size: int = 2000) -> List[str]:
    """Split text into size-bounded chunks at paragraph boundaries.

    Paragraphs (separated by a blank line) are packed greedily into chunks
    whose joined length stays within ``max_chunk_size``. Paragraphs are never
    split, so a single paragraph longer than the limit becomes its own
    oversized chunk.

    Args:
        text: The text to split.
        max_chunk_size: Maximum length, in characters, of a joined chunk.

    Returns:
        The chunks in document order. Empty or whitespace-only input yields
        an empty list, so callers do not issue requests for blank chunks.
    """
    separator = "\n\n"
    if not text.strip():
        return []

    chunks: List[str] = []
    current: List[str] = []
    current_size = 0

    for para in text.split(separator):
        # Count the separator that join() will insert, so the size tracked
        # here matches the real length of the joined chunk.
        added = len(para) + (len(separator) if current else 0)
        if current and current_size + added > max_chunk_size:
            chunks.append(separator.join(current))
            current = [para]
            current_size = len(para)
        else:
            current.append(para)
            current_size += added

    if current:
        chunks.append(separator.join(current))

    return chunks
44
-
45
def parse_claude_response(response: str) -> List[Term]:
    """Extract ``TERM:`` / ``REASON:`` pairs from the model's reply.

    A pair is recorded when the next ``TERM:`` line (or the end of the reply)
    is reached and both a non-empty term and a non-empty reason have been
    seen. Lines that carry neither prefix are ignored.

    Args:
        response: Raw text returned by the model.

    Returns:
        The parsed terms, in the order they appear in the reply.
    """
    term_prefix = "TERM: "
    reason_prefix = "REASON: "

    parsed = []
    pending_term = None
    pending_reason = None

    for raw_line in response.split("\n"):
        stripped = raw_line.strip()
        if stripped.startswith(term_prefix):
            # A new term begins: emit the previous pair if it is complete.
            if pending_term and pending_reason:
                parsed.append(Term(pending_term, pending_reason))
            pending_term = stripped[len(term_prefix):].strip()
            pending_reason = None
        elif stripped.startswith(reason_prefix):
            pending_reason = stripped[len(reason_prefix):].strip()

    # The last pair has no following TERM line to trigger its emission.
    if pending_term and pending_reason:
        parsed.append(Term(pending_term, pending_reason))

    return parsed
70
-
71
def find_links_for_terms(exa: Exa, terms: List[Term]) -> Dict[str, Link]:
    """Look up one link per distinct term using Exa search.

    Args:
        exa: Exa client used to run the searches.
        terms: Terms to resolve; the same term may appear more than once
            (for example when several transcript chunks mention it).

    Returns:
        Mapping from term text to the top search result. Terms whose search
        fails or returns nothing are omitted.
    """
    links: Dict[str, Link] = {}

    for term in terms:
        # The same term is often reported for several chunks; one successful
        # search per distinct term is enough.
        if term.term in links:
            continue

        try:
            results = exa.search(term.term, num_results=1, type="auto")
            if results.results:
                top = results.results[0]
                links[term.term] = Link(
                    term=term.term,
                    url=top.url,
                    title=top.title,
                )
        except Exception as e:
            # Best effort: a failed lookup must not abort the remaining terms.
            print(f"Error finding link for {term.term}: {e}")
            continue

    return links
94
-
95
def add_links_to_text(text: str, links: Dict[str, "Link"]) -> str:
    """Turn the first occurrence of each linked term into a markdown link.

    All terms are matched in a single left-to-right pass, so text that has
    already been turned into a link (including its URL) is never rescanned.
    That prevents a short term such as "AI" from being linked inside the
    markup produced for a longer term such as "Constitutional AI paper".

    Args:
        text: The transcript text.
        links: Mapping from term text to an object exposing a ``url``
            attribute.

    Returns:
        ``text`` with the first whole-word occurrence of each term replaced
        by ``[term](url)``. Later occurrences are left untouched.
    """
    # Longest first, so the alternation prefers the longer of two terms that
    # start at the same position. Empty keys would match everywhere.
    ordered = sorted((term for term in links if term), key=len, reverse=True)
    if not ordered:
        return text

    # Lookarounds rather than \b so that terms which start or end with a
    # non-word character (for example "C++") still match as whole terms.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(term) for term in ordered) + r")(?!\w)"
    )
    already_linked = set()

    def _link_first(match):
        term = match.group(0)
        if term in already_linked:
            return term
        already_linked.add(term)
        # Returning the string from a callable keeps backslashes in the term
        # or URL literal; a plain replacement string is parsed as a template.
        return f"[{term}]({links[term].url})"

    return pattern.sub(_link_first, text)
119
-
120
def process_transcript(
    transcript_path: Path,
    claude_client: anthropic.Client,
    exa_client: Exa,
    prompt_template: str
) -> str:
    """Return the transcript at ``transcript_path`` with reference links added.

    The transcript is split into chunks, the model is asked for link-worthy
    terms in each chunk, the collected terms are resolved to URLs, and the
    links are inserted into the full text.

    Args:
        transcript_path: File containing the transcript.
        claude_client: Anthropic client used to request term suggestions.
        exa_client: Exa client used to find a link for each term.
        prompt_template: Instruction text placed before each chunk.

    Returns:
        The transcript text with markdown links inserted.
    """
    original_text = transcript_path.read_text()

    candidate_terms = []
    for piece in chunk_text(original_text):
        reply = claude_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            system="You are a helpful AI assistant.",
            messages=[{"role": "user", "content": prompt_template + "\n\n" + piece}],
        )
        candidate_terms += parse_claude_response(reply.content[0].text)

    resolved_links = find_links_for_terms(exa_client, candidate_terms)
    return add_links_to_text(original_text, resolved_links)
156
-
157
def main() -> int:
    """Command-line entry point: add reference links to a transcript file.

    Reads the transcript named on the command line (default
    ``output/transcripts/transcript.md``), links it, and writes the result to
    ``--output`` or to the input path with a ``-linked`` suffix.

    Returns:
        0 on success, 1 if processing or saving failed.

    Raises:
        FileNotFoundError: If the transcript or the prompt file is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "transcript",
        nargs="?",  # Make the argument optional
        default="output/transcripts/transcript.md",
        help="Path to transcript file (default: output/transcripts/transcript.md)"
    )
    parser.add_argument("--output", help="Output file path (default: input path with -linked suffix)")
    args = parser.parse_args()

    transcript_path = Path(args.transcript)
    if not transcript_path.exists():
        raise FileNotFoundError(f"Transcript file not found: {transcript_path}")

    # Set up output path
    if args.output:
        output_path = Path(args.output)
    else:
        stem = transcript_path.stem
        output_path = transcript_path.parent / f"{stem}-linked{transcript_path.suffix}"

    # Read prompt template
    prompt_path = Path("prompts/find_links.txt")
    if not prompt_path.exists():
        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
    prompt_template = prompt_path.read_text()

    # Initialize clients
    claude_client = anthropic.Client(api_key=os.getenv("ANTHROPIC_API_KEY"))
    exa_client = Exa(api_key=os.getenv("EXA_API_KEY"))

    try:
        linked_text = process_transcript(
            transcript_path,
            claude_client,
            exa_client,
            prompt_template
        )

        # Save output
        output_path.write_text(linked_text)
        print(f"Processed transcript saved to: {output_path}")

    except Exception as e:
        print(f"Error processing transcript: {e}")
        return 1

    return 0


if __name__ == "__main__":
    # Pass main()'s status to the shell; calling main() bare would discard it
    # and the process would exit 0 even after a failure.
    raise SystemExit(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/preview_generator.py DELETED
@@ -1,92 +0,0 @@
1
- import argparse
2
- from pathlib import Path
3
- import os
4
- from google import generativeai
5
- from pydub import AudioSegment
6
-
7
-
8
class PreviewGenerator:
    """Asks Gemini for preview-clip suggestions for a podcast episode."""

    def __init__(self, api_key: str):
        """Configure the Gemini client and load the preview prompt.

        Args:
            api_key: Key passed to ``generativeai.configure``.
        """
        generativeai.configure(api_key=api_key)
        self.model = generativeai.GenerativeModel("gemini-exp-1206")
        self.prompt = Path("prompts/previews.txt").read_text()

    async def generate_previews(self, audio_path: Path, transcript_path: Path = None) -> str:
        """Return the model's preview suggestions for an episode.

        Args:
            audio_path: Audio file to analyse.
            transcript_path: Optional transcript file; it is uploaded as well
                when given and present on disk.

        Returns:
            The text of the model's response.
        """
        import io

        print("Generating preview suggestions...")

        # Re-encode as a low-quality MP3 in memory for faster processing.
        source_audio = AudioSegment.from_file(audio_path)
        compressed = io.BytesIO()
        source_audio.export(compressed, format="mp3", parameters=["-q:a", "9"])
        compressed.seek(0)

        uploaded_audio = generativeai.upload_file(compressed, mime_type="audio/mp3")
        request_parts = [self.prompt, uploaded_audio]

        if transcript_path and transcript_path.exists():
            print("Including transcript in analysis...")
            request_parts.append(generativeai.upload_file(transcript_path))

        reply = await self.model.generate_content_async(request_parts)
        return reply.text
48
-
49
-
50
async def main() -> int:
    """Command-line entry point: generate preview suggestions for an episode.

    Writes the suggestions to ``output/previews.txt`` and echoes them to the
    console.

    Returns:
        0 on success, 1 if generation or saving failed.

    Raises:
        FileNotFoundError: If the audio file does not exist.
    """
    parser = argparse.ArgumentParser(description="Generate podcast preview suggestions")
    parser.add_argument("audio_file", help="Audio file to analyze")
    parser.add_argument("--transcript", "-t", help="Optional transcript file")
    args = parser.parse_args()

    audio_path = Path(args.audio_file)
    if not audio_path.exists():
        raise FileNotFoundError(f"File not found: {audio_path}")

    # A missing transcript is not fatal: fall back to audio-only analysis.
    transcript_path = Path(args.transcript) if args.transcript else None
    if transcript_path and not transcript_path.exists():
        print(f"Warning: Transcript file not found: {transcript_path}")
        transcript_path = None

    # Ensure output directory exists
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / "previews.txt"

    try:
        generator = PreviewGenerator(os.getenv("GOOGLE_API_KEY"))
        suggestions = await generator.generate_previews(audio_path, transcript_path)

        # Save output
        output_path.write_text(suggestions)
        print(f"\nPreview suggestions saved to: {output_path}")

        # Also print to console
        print("\nPreview Suggestions:")
        print("-" * 40)
        print(suggestions)

    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0


if __name__ == "__main__":
    import asyncio

    # Pass main()'s status to the shell; discarding it would make the process
    # exit 0 even after a failure.
    raise SystemExit(asyncio.run(main()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/process_playlist.py DELETED
@@ -1,77 +0,0 @@
1
- import asyncio
2
- from pathlib import Path
3
- import sys
4
- import time
5
- from typing import List
6
-
7
- # Add the project root to Python path
8
- sys.path.append(str(Path(__file__).parent.parent))
9
-
10
- from utils.youtube_utils import get_transcript, get_playlist_video_ids
11
- from utils.content_generator import ContentGenerator, ContentRequest
12
-
13
- PLAYLIST_URL = "https://www.youtube.com/playlist?list=PLd7-bHaQwnthaNDpZ32TtYONGVk95-fhF"
14
- MAX_CONCURRENT = 3 # Limit concurrent requests
15
- RETRY_DELAY = 65 # Seconds to wait before retrying after rate limit
16
-
17
async def process_video(video_id: str, generator: ContentGenerator, retry_count: int = 0) -> str:
    """Generate title and thumbnail suggestions for one video.

    Rate-limit errors are retried after waiting ``RETRY_DELAY`` seconds, until
    the retry counter reaches 3. Any other failure is reported and skipped.

    Args:
        video_id: YouTube video identifier.
        generator: Content generator used to produce the suggestions.
        retry_count: Number of retries already spent on this video.

    Returns:
        A formatted result section, or an empty string when the video has no
        transcript or processing failed.
    """
    attempt = retry_count
    while True:
        try:
            print(f"Processing video {video_id}...")

            transcript = get_transcript(video_id)
            if not transcript:
                print(f"No transcript available for {video_id}")
                return ""

            request = ContentRequest("titles_and_thumbnails")
            result = await generator.generate_content(request, transcript)
            return f"Video ID: {video_id}\n{result}\n{'='*50}\n"

        except Exception as e:
            retryable = "rate_limit_error" in str(e) and attempt < 3
            if not retryable:
                print(f"Error processing {video_id}: {e}")
                return ""
            print(f"Rate limit hit for {video_id}, waiting {RETRY_DELAY}s before retry {attempt + 1}")
            await asyncio.sleep(RETRY_DELAY)
            attempt += 1
40
-
41
async def process_batch(video_ids: List[str], generator: ContentGenerator) -> List[str]:
    """Process a group of videos concurrently.

    Args:
        video_ids: Identifiers of the videos in this batch.
        generator: Content generator shared by all videos.

    Returns:
        One result string per video, in the same order as ``video_ids``.
    """
    pending = (process_video(video_id, generator) for video_id in video_ids)
    return await asyncio.gather(*pending)
45
-
46
async def process_playlist():
    """Generate suggestions for every video in ``PLAYLIST_URL``.

    Videos are handled ``MAX_CONCURRENT`` at a time with a short pause between
    batches. Non-empty results are written to
    ``output/playlist-titles-thumbnails.txt``.
    """
    generator = ContentGenerator()
    output_file = Path("output/playlist-titles-thumbnails.txt")

    print("Getting videos from playlist...")
    video_ids = get_playlist_video_ids(PLAYLIST_URL)
    total = len(video_ids)
    print(f"Found {total} videos")

    collected = []
    batch_offsets = range(0, total, MAX_CONCURRENT)
    for batch_number, offset in enumerate(batch_offsets, start=1):
        next_offset = offset + MAX_CONCURRENT
        print(f"\nProcessing batch {batch_number}")
        collected.extend(await process_batch(video_ids[offset:next_offset], generator))

        # Pause between batches, but not after the final one.
        if next_offset < total:
            delay = 5
            print(f"Waiting {delay}s before next batch...")
            await asyncio.sleep(delay)

    # Failed or transcript-less videos come back as empty strings.
    non_empty = [entry for entry in collected if entry]
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text("\n".join(non_empty))
    print(f"\nResults written to {output_file}")


if __name__ == "__main__":
    asyncio.run(process_playlist())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/transcript.py DELETED
@@ -1,311 +0,0 @@
1
- import argparse
2
- from dataclasses import dataclass
3
- from pathlib import Path
4
- import json
5
- import hashlib
6
- import os
7
- from typing import List, Tuple, Iterator
8
- import assemblyai as aai
9
- from google import generativeai
10
- from pydub import AudioSegment
11
- import asyncio
12
- import io
13
- from multiprocessing import Pool
14
- from functools import partial
15
- from itertools import groupby
16
-
17
-
18
@dataclass
class Utterance:
    """One stretch of speech attributed to a single speaker."""

    speaker: str
    text: str
    start: int  # timestamp in ms from AssemblyAI
    end: int  # timestamp in ms from AssemblyAI

    @property
    def timestamp(self) -> str:
        """Start time rendered as HH:MM:SS (fractions of a second dropped)."""
        whole_seconds = int(self.start // 1000)
        whole_minutes, secs = divmod(whole_seconds, 60)
        hrs, mins = divmod(whole_minutes, 60)
        return f"{hrs:02d}:{mins:02d}:{secs:02d}"
34
-
35
-
36
class Transcriber:
    """Fetches speaker-labelled transcripts from AssemblyAI and caches them.

    Cached transcripts live in ``output/transcripts/.cache`` as JSON files
    named after the audio file's stem. Each stores the MD5 of the audio it was
    produced from, so a changed file is transcribed again.
    """

    def __init__(self, api_key: str):
        """Set the AssemblyAI API key and make sure the cache directory exists."""
        aai.settings.api_key = api_key
        self.cache_dir = Path("output/transcripts/.cache")
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_transcript(self, audio_path: Path) -> List["Utterance"]:
        """Return the utterances for ``audio_path``, using the cache if valid.

        Args:
            audio_path: Audio file to transcribe.

        Returns:
            The utterances in transcript order.

        Raises:
            RuntimeError: If AssemblyAI reports that the transcription failed.
        """
        cache_file = self.cache_dir / f"{audio_path.stem}.json"
        # Hash once: the digest both validates an existing cache entry and is
        # stored with a new one.
        audio_hash = self._get_file_hash(audio_path)

        cached = self._load_cached_utterances(cache_file, audio_hash)
        if cached is not None:
            print("Using cached AssemblyAI transcript...")
            return cached

        print("Getting new transcript from AssemblyAI...")
        config = aai.TranscriptionConfig(speaker_labels=True, language_code="en")
        transcript = aai.Transcriber().transcribe(str(audio_path), config=config)
        if transcript.status == aai.TranscriptStatus.error:
            # Report the service's own message instead of failing later on a
            # missing utterance list.
            raise RuntimeError(f"AssemblyAI transcription failed: {transcript.error}")

        utterances = [
            Utterance(
                speaker=u.speaker,
                text=u.text,
                start=u.start,
                end=u.end
            )
            for u in transcript.utterances or []
        ]

        # Cache the raw utterance data
        cache_data = {
            "hash": audio_hash,
            "utterances": [
                {
                    "speaker": u.speaker,
                    "text": u.text,
                    "start": u.start,
                    "end": u.end
                }
                for u in utterances
            ]
        }
        with open(cache_file, "w", encoding="utf-8") as f:
            json.dump(cache_data, f, indent=2)

        return utterances

    def _load_cached_utterances(self, cache_file: Path, expected_hash: str):
        """Return cached utterances, or None when the cache cannot be used.

        The cache is unusable when the file is missing, unreadable, malformed,
        or was produced from audio with a different hash.
        """
        if not cache_file.exists():
            return None
        try:
            with open(cache_file, encoding="utf-8") as f:
                data = json.load(f)
            if data["hash"] != expected_hash:
                return None
            return [
                Utterance(
                    speaker=u["speaker"],
                    text=u["text"],
                    start=u["start"],
                    end=u["end"]
                )
                for u in data["utterances"]
            ]
        except (OSError, ValueError, KeyError, TypeError):
            # A damaged cache entry is treated as a miss; it is overwritten
            # after the next successful transcription.
            return None

    def _get_file_hash(self, file_path: Path) -> str:
        """Return the hex MD5 digest of a file, read in 4 KiB blocks."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
103
-
104
-
105
class Enhancer:
    """Rewrites transcript chunks for readability using Gemini."""

    def __init__(self, api_key: str):
        """Configure the Gemini client and load the enhancement prompt."""
        generativeai.configure(api_key=api_key)
        self.model = generativeai.GenerativeModel("gemini-exp-1206")
        self.prompt = Path("prompts/enhance.txt").read_text()

    async def enhance_chunks(self, chunks: List[Tuple[str, io.BytesIO]]) -> List[str]:
        """Enhance every chunk, with at most three requests in flight.

        Args:
            chunks: Pairs of (transcript text, MP3 audio buffer).

        Returns:
            The model's text for each chunk, in input order.
        """
        total = len(chunks)
        print(f"Enhancing {total} chunks...")

        # Caps the number of simultaneous requests to the model.
        limiter = asyncio.Semaphore(3)

        async def enhance_one(index: int, pair: Tuple[str, io.BytesIO]) -> str:
            transcript_text, audio_buffer = pair
            async with limiter:
                audio_buffer.seek(0)
                audio_part = {"mime_type": "audio/mp3", "data": audio_buffer.read()}
                reply = await self.model.generate_content_async(
                    [self.prompt, transcript_text, audio_part]
                )
                print(f"Completed chunk {index+1}/{total}")
                return reply.text

        pending = [enhance_one(index, pair) for index, pair in enumerate(chunks)]
        return await asyncio.gather(*pending)
139
-
140
-
141
@dataclass
class SpeakerDialogue:
    """A run of consecutive utterances by one speaker."""

    speaker: str
    utterances: List[Utterance]

    @property
    def start(self) -> int:
        """Start time of the first utterance."""
        first = self.utterances[0]
        return first.start

    @property
    def end(self) -> int:
        """End time of the last utterance."""
        last = self.utterances[-1]
        return last.end

    @property
    def timestamp(self) -> str:
        """HH:MM:SS timestamp of the first utterance."""
        first = self.utterances[0]
        return first.timestamp

    def format(self, markdown: bool = False) -> str:
        """Render the dialogue as a speaker header followed by its text.

        Args:
            markdown: If True, emphasise the speaker and timestamp with
                markdown markup.

        Returns:
            The header, a blank line, then the utterance texts separated by
            blank lines, with trailing whitespace removed.
        """
        body = "\n\n".join(u.text for u in self.utterances).rstrip()
        if markdown:
            header = f"**Speaker {self.speaker}** *{self.timestamp}*"
        else:
            header = f"Speaker {self.speaker} {self.timestamp}"
        return f"{header}\n\n{body}"
172
-
173
-
174
def group_utterances_by_speaker(utterances: List[Utterance]) -> Iterator[SpeakerDialogue]:
    """Yield one SpeakerDialogue per run of consecutive same-speaker utterances.

    Only adjacent utterances are merged; a speaker who talks again later
    starts a new dialogue.
    """
    def speaker_of(utterance):
        return utterance.speaker

    yield from (
        SpeakerDialogue(speaker=name, utterances=list(run))
        for name, run in groupby(utterances, key=speaker_of)
    )
178
-
179
-
180
def estimate_tokens(text: str, chars_per_token: int = 4) -> int:
    """Estimate the token count of ``text`` from its length.

    Args:
        text: The text to measure.
        chars_per_token: Assumed average number of characters per token.

    Returns:
        ``len(text) / chars_per_token`` rounded up to a whole number.
    """
    # Adding (divisor - 1) before floor division rounds the quotient up.
    padded_length = len(text) + chars_per_token - 1
    return padded_length // chars_per_token
188
-
189
-
190
def chunk_dialogues(
    dialogues: Iterator[SpeakerDialogue],
    max_tokens: int = 2000,
    chars_per_token: int = 4
) -> List[List[SpeakerDialogue]]:
    """Pack dialogues, in order, into chunks that fit a token budget.

    A dialogue is never split: one that exceeds the budget on its own still
    forms a chunk by itself.

    Args:
        dialogues: Dialogues to pack.
        max_tokens: Maximum estimated tokens of a chunk's formatted text.
        chars_per_token: Characters-per-token ratio used for the estimate.

    Returns:
        The chunks, each a list of dialogues.
    """
    finished = []
    open_chunk = []
    open_text = ""

    for dialogue in dialogues:
        rendered = dialogue.format()
        # Text the open chunk would contain if this dialogue were appended.
        candidate = f"{open_text}\n\n{rendered}" if open_text else rendered

        overflows = bool(open_chunk) and estimate_tokens(candidate, chars_per_token) > max_tokens
        if overflows:
            finished.append(open_chunk)
            open_chunk, open_text = [dialogue], rendered
            continue

        open_chunk.append(dialogue)
        open_text = candidate

    if open_chunk:
        finished.append(open_chunk)

    return finished
224
-
225
-
226
def format_chunk(dialogues: List[SpeakerDialogue], markdown: bool = False) -> str:
    """Render a chunk of dialogues as text separated by blank lines.

    Args:
        dialogues: Dialogues to render, in order.
        markdown: Passed to each dialogue's ``format``; if True, speaker and
            timestamp get markdown markup.
    """
    rendered = [entry.format(markdown=markdown) for entry in dialogues]
    return "\n\n".join(rendered)
233
-
234
-
235
def prepare_audio_chunks(audio_path: Path, utterances: List[Utterance]) -> List[Tuple[str, io.BytesIO]]:
    """Pair each transcript chunk with the matching slice of audio.

    Args:
        audio_path: Audio file the utterances were transcribed from.
        utterances: Utterances in transcript order.

    Returns:
        One (plain-text chunk, MP3 buffer) pair per chunk.
    """
    chunks = chunk_dialogues(group_utterances_by_speaker(utterances))
    print(f"Preparing {len(chunks)} audio segments...")

    # Decode the file once and slice it for every chunk.
    full_audio = AudioSegment.from_file(audio_path)

    prepared = []
    for chunk in chunks:
        first_dialogue, last_dialogue = chunk[0], chunk[-1]
        clip = full_audio[first_dialogue.start:last_dialogue.end]

        mp3_buffer = io.BytesIO()
        # Use lower quality MP3 for faster processing
        clip.export(mp3_buffer, format="mp3", parameters=["-q:a", "9"])

        # Plain (non-markdown) text is what gets sent to the model.
        prepared.append((format_chunk(chunk, markdown=False), mp3_buffer))

    return prepared
257
-
258
-
259
def main():
    """Transcribe an audio file and write raw and enhanced transcripts.

    Writes ``autogenerated-transcript.md`` (the transcript as returned by the
    transcriber) and ``transcript.md`` (the enhanced version) under
    ``output/transcripts``.

    Returns:
        0 on success, 1 if any step after argument validation failed.

    Raises:
        FileNotFoundError: If the audio file does not exist.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("audio_file", help="Audio file to transcribe")
    options = cli.parse_args()

    audio_path = Path(options.audio_file)
    if not audio_path.exists():
        raise FileNotFoundError(f"File not found: {audio_path}")

    out_dir = Path("output/transcripts")
    out_dir.mkdir(parents=True, exist_ok=True)

    try:
        utterances = Transcriber(os.getenv("ASSEMBLYAI_API_KEY")).get_transcript(audio_path)

        # Raw transcript, grouped by speaker, with markdown headers.
        raw_markdown = format_chunk(list(group_utterances_by_speaker(utterances)), markdown=True)
        (out_dir / "autogenerated-transcript.md").write_text(raw_markdown)

        # Enhanced transcript: chunks are enhanced separately, then rejoined.
        enhancer = Enhancer(os.getenv("GOOGLE_API_KEY"))
        audio_chunks = prepare_audio_chunks(audio_path, utterances)
        enhanced_parts = asyncio.run(enhancer.enhance_chunks(audio_chunks))

        joined = "\n\n".join(part.strip() for part in enhanced_parts)
        (out_dir / "transcript.md").write_text(apply_markdown_formatting(joined))

        print("\nTranscripts saved to:")
        print(f"- {out_dir}/autogenerated-transcript.md")
        print(f"- {out_dir}/transcript.md")

    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0
301
-
302
-
303
def apply_markdown_formatting(text: str) -> str:
    """Add markdown emphasis to every ``Speaker X HH:MM:SS`` header in ``text``.

    The speaker label becomes bold and the timestamp italic; all other text
    is returned unchanged.
    """
    import re

    header = re.compile(r"(Speaker \w+) (\d{2}:\d{2}:\d{2})")
    return header.sub(r"**\1** *\2*", text)
308
-
309
-
310
if __name__ == "__main__":
    # Pass main()'s status (0 or 1) to the shell; calling main() bare would
    # discard it and the process would exit 0 even after a failure.
    raise SystemExit(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Empty file to make utils a package
 
 
utils/content_generator.py DELETED
@@ -1,79 +0,0 @@
1
- import anthropic
2
- from dataclasses import dataclass
3
- from pathlib import Path
4
- import asyncio
5
- import concurrent.futures
6
- import time
7
- from typing import Dict, List
8
- import pandas as pd
9
-
10
- client = anthropic.Anthropic()
11
-
12
- @dataclass
13
- class ContentRequest:
14
- prompt_key: str
15
- max_tokens: int = 2000
16
- temperature: float = 1.0
17
-
18
- class ContentGenerator:
19
- def __init__(self):
20
- self.current_prompts = self._load_default_prompts()
21
-
22
- def _load_default_prompts(self) -> Dict[str, str]:
23
- """Load default prompts from files."""
24
- return {
25
- key: Path(f"prompts/{key}.txt").read_text()
26
- for key in ["previews", "clips", "description", "timestamps", "titles_and_thumbnails"]
27
- }
28
-
29
- def _load_examples(self, filename: str, columns: List[str]) -> str:
30
- """Load examples from CSV file."""
31
- try:
32
- df = pd.read_csv(f"data/{filename}")
33
- if len(columns) == 1:
34
- return "\n\n".join(df[columns[0]].dropna().tolist())
35
-
36
- examples = []
37
- for _, row in df.iterrows():
38
- if all(pd.notna(row[col]) for col in columns):
39
- example = "\n".join(f"{col}: {row[col]}" for col in columns)
40
- examples.append(example)
41
- return "\n\n".join(examples)
42
- except Exception as e:
43
- print(f"Error loading {filename}: {str(e)}")
44
- return ""
45
-
46
- async def generate_content(self, request: ContentRequest, transcript: str) -> str:
47
- """Generate content using Claude asynchronously."""
48
- print(f"Starting {request.prompt_key} generation...")
49
- start_time = time.time()
50
-
51
- example_configs = {
52
- "clips": ("Viral Twitter Clips.csv", ["Tweet Text", "Clip Transcript"]),
53
- "description": ("Viral Episode Descriptions.csv", ["Tweet Text"]),
54
- "timestamps": ("Timestamps.csv", ["Timestamps"]),
55
- "titles_and_thumbnails": ("Titles & Thumbnails.csv", ["Titles", "Thumbnail"]),
56
- }
57
-
58
- # Build prompt with examples
59
- full_prompt = self.current_prompts[request.prompt_key]
60
- if config := example_configs.get(request.prompt_key):
61
- if examples := self._load_examples(*config):
62
- full_prompt += f"\n\nPrevious examples:\n{examples}"
63
-
64
- # Run API call in thread pool
65
- loop = asyncio.get_event_loop()
66
- with concurrent.futures.ThreadPoolExecutor() as pool:
67
- message = await loop.run_in_executor(
68
- pool,
69
- lambda: client.messages.create(
70
- model="claude-3-5-sonnet-20241022",
71
- max_tokens=request.max_tokens,
72
- temperature=request.temperature,
73
- system=full_prompt,
74
- messages=[{"role": "user", "content": [{"type": "text", "text": f"Process this transcript:\n\n{transcript}"}]}]
75
- )
76
- )
77
- result = message.content[0].text
78
- print(f"Finished {request.prompt_key} in {time.time() - start_time:.2f} seconds")
79
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/document_parser.py DELETED
@@ -1,191 +0,0 @@
1
- from pathlib import Path
2
- import tempfile
3
- import os
4
- from ebooklib import epub
5
- from bs4 import BeautifulSoup
6
-
7
- class DocumentParser:
8
- """Simple EPUB document parser that extracts chapters and their content."""
9
-
10
- def __init__(self):
11
- self._temp_file = None
12
- self._book = None
13
- self._chapters = []
14
-
15
- def load_document(self, file_data, filename=None) -> list[str]:
16
- """Load an EPUB document and extract chapter titles.
17
-
18
- Args:
19
- file_data: File data from Gradio (FileData object with read() method)
20
- filename: Optional filename (not used)
21
- """
22
- # Clean up any previous temp file
23
- self.cleanup()
24
-
25
- # Get the raw bytes from the Gradio file data
26
- content = file_data.read() if hasattr(file_data, 'read') else file_data
27
-
28
- # Save to temp file
29
- with tempfile.NamedTemporaryFile(delete=False, suffix='.epub') as temp:
30
- temp.write(content)
31
- self._temp_file = temp.name
32
-
33
- # Read the EPUB
34
- try:
35
- self._book = epub.read_epub(self._temp_file)
36
- print("DEBUG: Successfully read EPUB file")
37
- except Exception as e:
38
- print(f"DEBUG: Error reading EPUB: {str(e)}")
39
- raise ValueError(f"Failed to read EPUB: {str(e)}")
40
-
41
- # Extract chapters
42
- self._chapters = self._extract_chapters()
43
- print(f"DEBUG: Extracted {len(self._chapters)} chapters")
44
-
45
- # Return chapter titles
46
- return [chapter['title'] for chapter in self._chapters]
47
-
48
- def get_chapter_content(self, chapter_idx: int) -> str:
49
- """Get the content of a specific chapter."""
50
- if not self._book or not self._chapters:
51
- raise ValueError("No document loaded")
52
-
53
- if not 0 <= chapter_idx < len(self._chapters):
54
- raise ValueError(f"Invalid chapter index: {chapter_idx}")
55
-
56
- chapter = self._chapters[chapter_idx]
57
- self._current_chapter_title = chapter['title'].strip() # Store for _get_chapter_text
58
-
59
- print(f"DEBUG: Getting content for chapter: {self._current_chapter_title}")
60
- content = self._get_chapter_text(chapter['item'])
61
- print(f"DEBUG: Extracted {len(content)} characters of content")
62
-
63
- return content
64
-
65
- def _extract_chapters(self) -> list[dict]:
66
- """Extract chapters from the EPUB file."""
67
- chapters = []
68
-
69
- # First try to get chapters from the table of contents
70
- print("DEBUG: Checking table of contents...")
71
- if hasattr(self._book, 'toc'):
72
- # Debug the TOC structure
73
- print("DEBUG: TOC structure:")
74
- for item in self._book.toc:
75
- print(f"DEBUG: TOC item type: {type(item)}")
76
- if isinstance(item, tuple):
77
- print(f"DEBUG: Tuple length: {len(item)}")
78
- if len(item) > 1:
79
- print(f"DEBUG: Second item type: {type(item[1])}")
80
- if isinstance(item[1], (list, tuple)):
81
- print(f"DEBUG: Sub-items count: {len(item[1])}")
82
-
83
- def process_toc_entries(entries, level=0):
84
- for item in entries:
85
- # Handle both Link objects and tuples
86
- if hasattr(item, 'title') and hasattr(item, 'href'):
87
- # Direct Link object
88
- doc = self._book.get_item_with_href(item.href)
89
- if doc:
90
- prefix = " " * level if level > 0 else ""
91
- chapters.append({
92
- 'title': prefix + item.title,
93
- 'item': doc
94
- })
95
- elif isinstance(item, tuple):
96
- section = item[0]
97
- # Process the section
98
- if hasattr(section, 'title') and hasattr(section, 'href'):
99
- doc = self._book.get_item_with_href(section.href)
100
- if doc:
101
- prefix = " " * level if level > 0 else ""
102
- chapters.append({
103
- 'title': prefix + section.title,
104
- 'item': doc
105
- })
106
-
107
- # Process sub-items if they exist
108
- if len(item) > 1:
109
- if isinstance(item[1], (list, tuple)):
110
- process_toc_entries(item[1], level + 1)
111
- elif hasattr(item[1], 'title'): # Single sub-item
112
- process_toc_entries([item[1]], level + 1)
113
-
114
- process_toc_entries(self._book.toc)
115
- print(f"DEBUG: Found {len(chapters)} chapters in TOC")
116
- print("DEBUG: Chapter titles found:")
117
- for ch in chapters:
118
- print(f" - {ch['title']}")
119
-
120
- # If no chapters found in TOC, scan the documents
121
- if not chapters:
122
- print("DEBUG: No chapters in TOC, scanning documents...")
123
- # Get all HTML documents
124
- docs = [item for item in self._book.get_items()
125
- if item.get_type() == epub.ITEM_DOCUMENT]
126
-
127
- print(f"DEBUG: Found {len(docs)} documents to scan")
128
-
129
- for doc in docs:
130
- soup = BeautifulSoup(doc.get_content(), 'html.parser')
131
-
132
- # Look for chapter headings
133
- headings = (
134
- soup.find_all(['h1', 'h2']) +
135
- soup.find_all(class_=lambda x: x and ('chapter' in x.lower() or 'title' in x.lower()))
136
- )
137
-
138
- for heading in headings:
139
- # Clean up the text
140
- title = ' '.join(heading.get_text().split())
141
- if title: # Only add if we have a title
142
- chapters.append({
143
- 'title': title,
144
- 'item': doc
145
- })
146
-
147
- if not chapters:
148
- print("DEBUG: No chapters found, using documents as chapters")
149
- # If still no chapters found, treat each document as a chapter
150
- for doc in self._book.get_items():
151
- if doc.get_type() == epub.ITEM_DOCUMENT:
152
- chapters.append({
153
- 'title': f"Chapter {len(chapters) + 1}",
154
- 'item': doc
155
- })
156
-
157
- return chapters
158
-
159
- def _get_chapter_text(self, item) -> str:
160
- """Extract text content from a chapter."""
161
- try:
162
- soup = BeautifulSoup(item.get_content(), 'html.parser')
163
-
164
- # Remove script and style elements
165
- for element in soup(['script', 'style']):
166
- element.decompose()
167
-
168
- # Get main content area (usually in body or main tags)
169
- content_area = soup.find('body') or soup.find('main') or soup
170
-
171
- # Get all text blocks, excluding navigation elements
172
- text_blocks = []
173
- for element in content_area.find_all(text=True, recursive=True):
174
- if (element.parent.name not in ['script', 'style', 'nav', 'header'] and
175
- element.strip()):
176
- text_blocks.append(element.strip())
177
-
178
- return '\n\n'.join(text_blocks)
179
-
180
- except Exception as e:
181
- print(f"DEBUG: Error extracting text: {str(e)}")
182
- # Fallback to simple text extraction
183
- return soup.get_text(separator='\n\n', strip=True)
184
-
185
- def cleanup(self):
186
- """Clean up temporary files."""
187
- if self._temp_file and os.path.exists(self._temp_file):
188
- os.unlink(self._temp_file)
189
- self._temp_file = None
190
- self._book = None
191
- self._chapters = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/youtube_utils.py DELETED
@@ -1,26 +0,0 @@
1
- from youtube_transcript_api import YouTubeTranscriptApi
2
- from pytube import Playlist
3
- import re
4
- from typing import Optional, List
5
-
6
- def extract_video_id(url: str) -> Optional[str]:
7
- """Extract video ID from various YouTube URL formats."""
8
- match = re.search(
9
- r"(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/)([A-Za-z0-9_-]+)",
10
- url
11
- )
12
- return match.group(1) if match else None
13
-
14
- def get_transcript(video_id: str) -> str:
15
- """Get transcript from YouTube video ID."""
16
- try:
17
- transcript = YouTubeTranscriptApi.list_transcripts(video_id).find_transcript(["en"])
18
- return " ".join(entry["text"] for entry in transcript.fetch())
19
- except Exception as e:
20
- print(f"Error fetching transcript for {video_id}: {str(e)}")
21
- return ""
22
-
23
- def get_playlist_video_ids(playlist_url: str) -> List[str]:
24
- """Get all video IDs from a YouTube playlist."""
25
- playlist = Playlist(playlist_url)
26
- return [url.split("watch?v=")[1] for url in playlist.video_urls]