a good start
Browse files- .DS_Store +0 -0
- .gitignore +1 -0
- README.md +30 -6
- app.py +272 -0
- apps/producer.py +0 -128
- apps/reader.py +0 -364
- prompts/card_generation.txt +0 -38
- prompts/clips.txt +5 -1
- prompts/commentary.txt +0 -25
- prompts/description.txt +4 -1
- prompts/enhance.txt +0 -46
- prompts/find_links.txt +0 -35
- prompts/timestamps.txt +1 -1
- prompts/titles_and_thumbnails.txt +3 -0
- requirements.txt +2 -11
- scripts/add_links.py +0 -209
- scripts/preview_generator.py +0 -92
- scripts/process_playlist.py +0 -77
- scripts/transcript.py +0 -311
- utils/__init__.py +0 -1
- utils/content_generator.py +0 -79
- utils/document_parser.py +0 -191
- utils/youtube_utils.py +0 -26
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
.gitignore
CHANGED
@@ -3,3 +3,4 @@ __pycache__/
|
|
3 |
*/__pycache__/
|
4 |
.DS_Store
|
5 |
*.pyc
|
|
|
|
3 |
*/__pycache__/
|
4 |
.DS_Store
|
5 |
*.pyc
|
6 |
+
.venv/
|
README.md
CHANGED
@@ -1,8 +1,32 @@
|
|
1 |
-
#
|
2 |
-
python apps/reader.py
|
3 |
|
4 |
-
|
5 |
-
python apps/producer.py
|
6 |
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Podcast Content Generator
|
|
|
2 |
|
3 |
+
A Gradio app that helps podcast producers generate preview clips, timestamps, descriptions, and more from podcast transcripts or YouTube videos.
|
|
|
4 |
|
5 |
+
## Features
|
6 |
+
|
7 |
+
- Generate preview clip suggestions
|
8 |
+
- Create Twitter/social media clips
|
9 |
+
- Generate episode descriptions
|
10 |
+
- Create timestamps
|
11 |
+
- Get title and thumbnail suggestions
|
12 |
+
- Support for YouTube URLs or raw transcript text
|
13 |
+
- Customizable prompts for each type of content
|
14 |
+
|
15 |
+
## Usage
|
16 |
+
|
17 |
+
1. Paste a YouTube URL or transcript text into the input box
|
18 |
+
2. Click "Generate Content" to process
|
19 |
+
3. Get generated content in various formats
|
20 |
+
4. Optionally customize the prompts used for generation
|
21 |
+
|
22 |
+
## Environment Variables
|
23 |
+
|
24 |
+
The app requires the following environment variable:
|
25 |
+
- `ANTHROPIC_API_KEY`: Your Anthropic API key for Claude
|
26 |
+
|
27 |
+
## Credits
|
28 |
+
|
29 |
+
Built with:
|
30 |
+
- Gradio
|
31 |
+
- Claude AI (Anthropic)
|
32 |
+
- YouTube Transcript API
|
app.py
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import asyncio
|
3 |
+
from pathlib import Path
|
4 |
+
import anthropic
|
5 |
+
import os
|
6 |
+
from dataclasses import dataclass
|
7 |
+
from typing import Dict
|
8 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
9 |
+
import re
|
10 |
+
import pandas as pd
|
11 |
+
|
12 |
+
# Classes and functions consolidated into app.py from the former utils/ and apps/ modules
|
13 |
+
@dataclass
class ContentRequest:
    """Describe a single piece of content to generate.

    The generator looks up its system prompt with ``prompt_key``.
    """

    # Key into the generator's prompt mapping, e.g. "clips" or "timestamps".
    prompt_key: str
|
16 |
+
|
17 |
+
class ContentGenerator:
    """Generate podcast content with Claude, one system prompt per content type.

    ``current_prompts`` maps a prompt key (e.g. ``"clips"``) to the system
    prompt text used for that content type; ``client`` is the Anthropic client
    used to send requests.
    """

    # Prompt keys, in the order their prompt files are loaded.
    _PROMPT_KEYS = (
        "previews",
        "clips",
        "description",
        "timestamps",
        "titles_and_thumbnails",
    )

    def __init__(self):
        self.current_prompts = self._load_default_prompts()
        self.client = anthropic.Anthropic()

    @staticmethod
    def _load_examples(csv_path: str, formatter) -> str:
        """Return ``formatter(DataFrame)`` for *csv_path*, or ``""`` on any failure.

        Examples are optional: a missing or malformed CSV only produces a
        warning so the app can still start with the bare prompt.
        """
        try:
            return formatter(pd.read_csv(csv_path))
        except Exception as e:
            print(f"Warning: Error loading CSV examples: {e}")
            return ""

    @staticmethod
    def _format_timestamps(df) -> str:
        """Join the non-empty ``Timestamps`` cells, separated by blank lines."""
        return "\n\n".join(df["Timestamps"].dropna().astype(str).tolist())

    @staticmethod
    def _format_titles(df) -> str:
        """Render each row as a ``Title``/``Thumbnail`` pair."""
        return "\n".join(
            f'Title: "{row.Titles}"\nThumbnail: "{row.Thumbnail}"'
            for _, row in df.iterrows()
        )

    @staticmethod
    def _format_descriptions(df) -> str:
        """Render each row's ``Tweet Text`` as one example line."""
        return "\n".join(
            f'Tweet: "{row["Tweet Text"]}"'
            for _, row in df.iterrows()
        )

    @staticmethod
    def _format_clips(df) -> str:
        """Render tweet/transcript pairs, skipping rows without tweet text."""
        return "\n\n".join(
            f'Tweet Text: "{row["Tweet Text"]}"\nClip Transcript: "{row["Clip Transcript"]}"'
            for _, row in df.iterrows()
            if pd.notna(row["Tweet Text"])
        )

    def _load_default_prompts(self) -> Dict[str, str]:
        """Load the prompt files and inject CSV examples into their placeholders.

        Each example CSV is loaded independently, so a problem with one file
        only blanks that prompt's examples instead of all of them.

        Returns:
            Mapping of prompt key to prompt text.

        Raises:
            OSError: If a ``prompts/<key>.txt`` file cannot be read.
        """
        # prompt key -> (placeholder in the prompt file, formatted examples)
        examples = {
            "timestamps": (
                "{timestamps_examples}",
                self._load_examples("data/Timestamps.csv", self._format_timestamps),
            ),
            "titles_and_thumbnails": (
                "{title_examples}",
                self._load_examples("data/Titles & Thumbnails.csv", self._format_titles),
            ),
            "description": (
                "{description_examples}",
                self._load_examples(
                    "data/Viral Episode Descriptions.csv", self._format_descriptions
                ),
            ),
            "clips": (
                "{clip_examples}",
                self._load_examples("data/Viral Twitter Clips.csv", self._format_clips),
            ),
        }

        prompts = {}
        for key in self._PROMPT_KEYS:
            # Explicit encoding: prompt text must not depend on the platform default.
            prompt = Path(f"prompts/{key}.txt").read_text(encoding="utf-8")
            if key in examples:
                placeholder, example_text = examples[key]
                prompt = prompt.replace(placeholder, example_text)
            prompts[key] = prompt

        return prompts

    async def generate_content(self, request: "ContentRequest", transcript: str) -> str:
        """Generate one piece of content with Claude.

        Args:
            request: Selects the system prompt through ``request.prompt_key``.
            transcript: Transcript text sent as the user message.

        Returns:
            The text of the first content block of the response, or a string
            starting with ``"Error"`` when the request fails or the response
            has no content. This method does not raise.
        """
        try:
            system_prompt = self.current_prompts[request.prompt_key]

            print(f"\nFull prompt for {request.prompt_key}:")
            print("=== SYSTEM PROMPT ===")
            print(system_prompt)
            print("=== END SYSTEM PROMPT ===\n")

            # The client call is synchronous; run it in the default executor so
            # the event loop is not blocked while waiting for the API.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                lambda: self.client.messages.create(
                    model="claude-3-5-sonnet-20241022",
                    max_tokens=8192,
                    system=system_prompt,
                    messages=[
                        {"role": "user", "content": f"Process this transcript:\n\n{transcript}"}
                    ],
                ),
            )

            # An empty content list is also treated as an unexpected response.
            if response and getattr(response, "content", None):
                return response.content[0].text
            return f"Error: Unexpected response structure for {request.prompt_key}"

        except Exception as e:
            return f"Error generating content: {str(e)}"
|
101 |
+
|
102 |
+
def extract_video_id(url: str) -> "str | None":
    """Extract the video ID from a YouTube URL.

    Recognises ``watch?v=`` links (with ``v`` anywhere in the query string),
    ``youtu.be/`` short links, and ``/embed/``, ``/v/``, ``/shorts/`` and
    ``/live/`` paths on ``youtube.com`` or ``youtube-nocookie.com``.

    Args:
        url: Text containing a YouTube URL.

    Returns:
        The video ID, or ``None`` when no supported URL form is found
        (including empty or ``None`` input).
    """
    if not url:
        return None
    match = re.search(
        # The lazy "(?:...&)*?" skips query parameters that precede "v=" while
        # still preferring a "v=" that directly follows the "?".
        r"(?:youtube(?:-nocookie)?\.com/"
        r"(?:watch\?(?:[^\s#&]+&)*?v=|embed/|v/|shorts/|live/)"
        r"|youtu\.be/)([A-Za-z0-9_-]+)",
        url,
    )
    return match.group(1) if match else None
|
109 |
+
|
110 |
+
def get_transcript(video_id: str) -> str:
    """Fetch the English transcript of a YouTube video as a single string.

    Args:
        video_id: YouTube video ID (not a full URL).

    Returns:
        The text of all transcript entries joined with single spaces.

    Raises:
        RuntimeError: If the transcript cannot be listed or fetched. The
            failure is raised rather than returned, so an error message can
            never be mistaken for transcript text by the caller.
    """
    try:
        transcript = YouTubeTranscriptApi.list_transcripts(video_id).find_transcript(["en"])
        # NOTE(review): assumes fetch() yields dict-like entries with a "text"
        # key — confirm against the installed youtube-transcript-api version.
        return " ".join(entry["text"] for entry in transcript.fetch())
    except Exception as e:
        raise RuntimeError(f"Error fetching transcript: {str(e)}") from e
|
117 |
+
|
118 |
+
class TranscriptProcessor:
    """Turn a YouTube URL or raw transcript text into a markdown content bundle."""

    # (markdown heading, prompt key) in generation and display order.
    _SECTIONS = (
        ("Titles and Thumbnails", "titles_and_thumbnails"),
        ("Twitter Description", "description"),
        ("Preview Clips", "previews"),
        ("Twitter Clips", "clips"),
        ("Timestamps", "timestamps"),
    )

    def __init__(self):
        self.generator = ContentGenerator()

    @staticmethod
    def _looks_like_youtube_url(text: str) -> bool:
        """Return True when *text* is a single token containing a YouTube domain.

        A multi-word transcript that merely mentions "youtube.com" must be
        treated as transcript text, not as a URL.
        """
        candidate = text.strip()
        if not candidate or len(candidate.split()) != 1:
            return False
        return any(domain in candidate for domain in ("youtube.com", "youtu.be"))

    def _get_youtube_transcript(self, url: str) -> str:
        """Get the transcript for a YouTube URL.

        Raises:
            Exception: If no video ID can be extracted from *url* or the
                transcript lookup fails; the original error is chained.
        """
        try:
            if video_id := extract_video_id(url):
                return get_transcript(video_id)
            raise Exception("Invalid YouTube URL")
        except Exception as e:
            raise Exception(f"Error fetching YouTube transcript: {str(e)}") from e

    async def process_transcript(self, input_text: str):
        """Generate every content section for the given input.

        Args:
            input_text: A YouTube URL or raw transcript text.

        Returns:
            Markdown with one ``##`` section per content type, or a string
            starting with ``"Error processing input:"`` on failure. This
            method does not raise.
        """
        try:
            # Fail fast instead of spending one API call per section on nothing.
            if not input_text or not input_text.strip():
                raise ValueError("No transcript or YouTube URL provided")

            if self._looks_like_youtube_url(input_text):
                transcript = self._get_youtube_transcript(input_text.strip())
            else:
                transcript = input_text

            # Process each type sequentially
            sections = {}
            for _, key in self._SECTIONS:
                sections[key] = await self.generator.generate_content(
                    ContentRequest(key), transcript
                )

            # Combine into markdown with H2 headers
            body = "\n\n".join(
                f"## {heading}\n\n{sections[key]}" for heading, key in self._SECTIONS
            )
            return f"\n{body}\n"

        except Exception as e:
            return f"Error processing input: {str(e)}"

    def update_prompts(self, *values) -> str:
        """Replace the generator's prompts with *values*.

        Values are matched positionally to: previews, clips, description,
        timestamps, titles_and_thumbnails.

        Returns:
            A status message for the UI.
        """
        self.generator.current_prompts.update(zip(
            ["previews", "clips", "description", "timestamps", "titles_and_thumbnails"],
            values
        ))
        return "Prompts updated for this session!"
|
180 |
+
|
181 |
+
def create_interface():
    """Create the Gradio interface.

    Builds two tabs: one that generates all content from a YouTube URL or
    transcript, and one for editing the generation prompts.

    Returns:
        The ``gr.Blocks`` app, ready for ``launch()``.
    """
    processor = TranscriptProcessor()

    prompt_keys = [
        "previews",
        "clips",
        "description",
        "timestamps",
        "titles_and_thumbnails",
    ]
    # Snapshot taken before any edit, so "Reset" can restore the prompts that
    # were loaded at startup rather than re-applying the edited ones.
    default_prompts = {
        key: processor.generator.current_prompts[key] for key in prompt_keys
    }

    with gr.Blocks(title="Podcast Content Generator") as app:
        gr.Markdown(
            """
            # Podcast Content Generator
            Generate preview clips, timestamps, descriptions and more from podcast transcripts or YouTube videos.

            Simply paste a YouTube URL or raw transcript text to get started!
            """
        )

        with gr.Tab("Generate Content"):
            input_text = gr.Textbox(
                label="Input",
                placeholder="YouTube URL or transcript text...",
                lines=10
            )
            submit_btn = gr.Button("Generate Content")

            output = gr.Markdown()  # Single markdown output

            async def process_wrapper(text):
                """Run the processor and return markdown (or an error heading)."""
                print("Process wrapper started")
                print(f"Input text: {text[:100]}...")

                try:
                    result = await processor.process_transcript(text)
                    print("Process completed, got results")
                    return result
                except Exception as e:
                    print(f"Error in process_wrapper: {str(e)}")
                    return f"# Error\n\n{str(e)}"

            submit_btn.click(
                fn=process_wrapper,
                inputs=input_text,
                outputs=output,
                queue=True
            )

        with gr.Tab("Customize Prompts"):
            gr.Markdown(
                """
                ## Customize Generation Prompts
                Here you can experiment with different prompts during your session.
                Changes will remain active until you reload the page.

                Tip: Copy your preferred prompts somewhere safe if you want to reuse them later!
                """
            )

            # NOTE(review): the single `processor` above is shared by every
            # handler, so edits look process-wide rather than per browser
            # session — confirm whether that matches the text shown to users.
            prompt_inputs = [
                gr.Textbox(
                    label=f"{key.replace('_', ' ').title()} Prompt",
                    lines=10,
                    value=default_prompts[key]
                )
                for key in prompt_keys
            ]
            status = gr.Textbox(label="Status", interactive=False)

            # Update prompts when they change
            for prompt in prompt_inputs:
                prompt.change(
                    fn=processor.update_prompts,
                    inputs=prompt_inputs,
                    outputs=[status]
                )

            def reset_prompts():
                """Restore the startup prompts in the generator and the textboxes."""
                values = [default_prompts[key] for key in prompt_keys]
                status_message = processor.update_prompts(*values)
                return (status_message, *values)

            # Reset button
            reset_btn = gr.Button("Reset to Default Prompts")
            reset_btn.click(
                fn=reset_prompts,
                outputs=[status] + prompt_inputs,
            )

    return app
|
270 |
+
|
271 |
+
# Build and launch the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    create_interface().launch()
|
apps/producer.py
DELETED
@@ -1,128 +0,0 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import asyncio
|
3 |
-
from pathlib import Path
|
4 |
-
from ..utils.content_generator import ContentGenerator, ContentRequest
|
5 |
-
from ..utils.youtube_utils import get_transcript, extract_video_id
|
6 |
-
|
7 |
-
class TranscriptProcessor:
|
8 |
-
def __init__(self):
|
9 |
-
self.generator = ContentGenerator()
|
10 |
-
|
11 |
-
def _get_youtube_transcript(self, url: str) -> str:
|
12 |
-
"""Get transcript from YouTube URL."""
|
13 |
-
try:
|
14 |
-
if video_id := extract_video_id(url):
|
15 |
-
return get_transcript(video_id)
|
16 |
-
raise Exception("Invalid YouTube URL")
|
17 |
-
except Exception as e:
|
18 |
-
raise Exception(f"Error fetching YouTube transcript: {str(e)}")
|
19 |
-
|
20 |
-
async def process_transcript(self, input_text: str):
|
21 |
-
"""Process input and generate all content."""
|
22 |
-
try:
|
23 |
-
# Get transcript from URL or use direct input
|
24 |
-
transcript = (
|
25 |
-
self._get_youtube_transcript(input_text)
|
26 |
-
if any(x in input_text for x in ["youtube.com", "youtu.be"])
|
27 |
-
else input_text
|
28 |
-
)
|
29 |
-
|
30 |
-
# Define content generation requests
|
31 |
-
requests = [
|
32 |
-
ContentRequest("previews", max_tokens=8192),
|
33 |
-
ContentRequest("clips", max_tokens=8192),
|
34 |
-
ContentRequest("description"),
|
35 |
-
ContentRequest("timestamps"),
|
36 |
-
ContentRequest("titles_and_thumbnails"),
|
37 |
-
]
|
38 |
-
|
39 |
-
# Generate all content concurrently
|
40 |
-
results = await asyncio.gather(
|
41 |
-
*[self.generator.generate_content(req, transcript) for req in requests]
|
42 |
-
)
|
43 |
-
return tuple(results)
|
44 |
-
|
45 |
-
except Exception as e:
|
46 |
-
return (f"Error processing input: {str(e)}",) * 5
|
47 |
-
|
48 |
-
def update_prompts(self, *values) -> str:
|
49 |
-
"""Update the current session's prompts."""
|
50 |
-
self.generator.current_prompts.update(zip(
|
51 |
-
["previews", "clips", "description", "timestamps", "titles_and_thumbnails"],
|
52 |
-
values
|
53 |
-
))
|
54 |
-
return "Prompts updated for this session!"
|
55 |
-
|
56 |
-
def create_interface():
|
57 |
-
"""Create the Gradio interface."""
|
58 |
-
processor = TranscriptProcessor()
|
59 |
-
|
60 |
-
with gr.Blocks(title="Podcast Transcript Analyzer") as app:
|
61 |
-
with gr.Tab("Generate Content"):
|
62 |
-
gr.Markdown("# Podcast Content Generator")
|
63 |
-
input_text = gr.Textbox(label="Input", placeholder="YouTube URL or transcript...", lines=10)
|
64 |
-
submit_btn = gr.Button("Generate Content")
|
65 |
-
outputs = [
|
66 |
-
gr.Textbox(label=label, lines=10, interactive=False)
|
67 |
-
for label in ["Preview Clips", "Twitter Clips", "Twitter Description", "Timestamps", "Title & Thumbnail Suggestions"]
|
68 |
-
]
|
69 |
-
|
70 |
-
async def process_wrapper(text):
|
71 |
-
return await processor.process_transcript(text)
|
72 |
-
|
73 |
-
submit_btn.click(fn=process_wrapper, inputs=[input_text], outputs=outputs)
|
74 |
-
|
75 |
-
with gr.Tab("Experiment with Prompts"):
|
76 |
-
gr.Markdown("# Experiment with Prompts")
|
77 |
-
gr.Markdown(
|
78 |
-
"""
|
79 |
-
Here you can experiment with different prompts during your session.
|
80 |
-
Changes will remain active until you reload the page.
|
81 |
-
|
82 |
-
Tip: Copy your preferred prompts somewhere safe if you want to reuse them later!
|
83 |
-
"""
|
84 |
-
)
|
85 |
-
|
86 |
-
prompt_inputs = [
|
87 |
-
gr.Textbox(
|
88 |
-
label="Preview Clips Prompt", lines=10, value=processor.generator.current_prompts["previews"]
|
89 |
-
),
|
90 |
-
gr.Textbox(
|
91 |
-
label="Clips Prompt", lines=10, value=processor.generator.current_prompts["clips"]
|
92 |
-
),
|
93 |
-
gr.Textbox(
|
94 |
-
label="Description Prompt",
|
95 |
-
lines=10,
|
96 |
-
value=processor.generator.current_prompts["description"],
|
97 |
-
),
|
98 |
-
gr.Textbox(
|
99 |
-
label="Timestamps Prompt",
|
100 |
-
lines=10,
|
101 |
-
value=processor.generator.current_prompts["timestamps"],
|
102 |
-
),
|
103 |
-
gr.Textbox(
|
104 |
-
label="Titles & Thumbnails Prompt",
|
105 |
-
lines=10,
|
106 |
-
value=processor.generator.current_prompts["titles_and_thumbnails"],
|
107 |
-
),
|
108 |
-
]
|
109 |
-
status = gr.Textbox(label="Status", interactive=False)
|
110 |
-
|
111 |
-
# Update prompts when they change
|
112 |
-
for prompt in prompt_inputs:
|
113 |
-
prompt.change(fn=processor.update_prompts, inputs=prompt_inputs, outputs=[status])
|
114 |
-
|
115 |
-
# Reset button
|
116 |
-
reset_btn = gr.Button("Reset to Default Prompts")
|
117 |
-
reset_btn.click(
|
118 |
-
fn=lambda: (
|
119 |
-
processor.update_prompts(*processor.generator.current_prompts.values()),
|
120 |
-
*processor.generator.current_prompts.values(),
|
121 |
-
),
|
122 |
-
outputs=[status] + prompt_inputs,
|
123 |
-
)
|
124 |
-
|
125 |
-
return app
|
126 |
-
|
127 |
-
if __name__ == "__main__":
|
128 |
-
create_interface().launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
apps/reader.py
DELETED
@@ -1,364 +0,0 @@
|
|
1 |
-
import sys
|
2 |
-
from pathlib import Path
|
3 |
-
|
4 |
-
# Add project root to Python path
|
5 |
-
project_root = str(Path(__file__).parent.parent)
|
6 |
-
if project_root not in sys.path:
|
7 |
-
sys.path.append(project_root)
|
8 |
-
|
9 |
-
import gradio as gr
|
10 |
-
import asyncio
|
11 |
-
import os
|
12 |
-
import json
|
13 |
-
import requests
|
14 |
-
from anthropic import Anthropic
|
15 |
-
from utils.document_parser import DocumentParser
|
16 |
-
from dotenv import load_dotenv
|
17 |
-
|
18 |
-
# Load environment variables
|
19 |
-
env_path = Path(project_root) / ".env"
|
20 |
-
load_dotenv(env_path)
|
21 |
-
|
22 |
-
# Mochi deck IDs
|
23 |
-
DECK_CATEGORIES = {
|
24 |
-
"CS/Hardware": "rhGqR9SK",
|
25 |
-
"Math/Physics": "Dm5vczZg",
|
26 |
-
"AI": "SS9QEfiy",
|
27 |
-
"History/Military": "3nJYp7Zh",
|
28 |
-
"Quotes/Random": "rWUzSu8t",
|
29 |
-
"Bio": "BspzxaUJ",
|
30 |
-
"Econ/Finance": "mvvJ27Q1"
|
31 |
-
}
|
32 |
-
|
33 |
-
class CardGenerator:
|
34 |
-
"""Handles card generation and Mochi integration."""
|
35 |
-
|
36 |
-
def __init__(self):
|
37 |
-
self.parser = DocumentParser()
|
38 |
-
self.claude = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
39 |
-
self.mochi_key = os.getenv("MOCHI_API_KEY")
|
40 |
-
|
41 |
-
# Load prompts
|
42 |
-
self.prompts = {
|
43 |
-
key: Path(f"prompts/{key}.txt").read_text()
|
44 |
-
for key in ["card_generation", "commentary"]
|
45 |
-
}
|
46 |
-
|
47 |
-
# State
|
48 |
-
self.current_cards = []
|
49 |
-
self.current_index = 0
|
50 |
-
self.approved_cards = []
|
51 |
-
|
52 |
-
def get_chapter_list(self, file_data) -> list[str]:
|
53 |
-
"""Get list of chapters from document.
|
54 |
-
|
55 |
-
Args:
|
56 |
-
file_data: File data from Gradio
|
57 |
-
"""
|
58 |
-
try:
|
59 |
-
if not file_data:
|
60 |
-
return []
|
61 |
-
|
62 |
-
# Attempt to extract filename from file_data
|
63 |
-
filename = getattr(file_data, 'name', None)
|
64 |
-
if not filename:
|
65 |
-
filename = "uploaded_file"
|
66 |
-
print("DEBUG: No filename attribute found, using default.")
|
67 |
-
else:
|
68 |
-
print(f"DEBUG: Filename extracted: {filename}")
|
69 |
-
|
70 |
-
# Check file extension
|
71 |
-
file_ext = Path(filename).suffix.lower()
|
72 |
-
if not file_ext:
|
73 |
-
print("DEBUG: No file extension found, checking content type.")
|
74 |
-
# Attempt to determine file type from content
|
75 |
-
if file_data.startswith(b'%PDF-'):
|
76 |
-
file_ext = '.pdf'
|
77 |
-
elif file_data.startswith(b'PK'):
|
78 |
-
file_ext = '.epub'
|
79 |
-
else:
|
80 |
-
raise ValueError("Unsupported file type")
|
81 |
-
print(f"DEBUG: File extension: {file_ext}")
|
82 |
-
|
83 |
-
return self.parser.load_document(file_data, filename)
|
84 |
-
except Exception as e:
|
85 |
-
return [f"Error: {str(e)}"]
|
86 |
-
|
87 |
-
async def process_chapter(self, file_data, chapter_idx: int) -> tuple:
|
88 |
-
"""Process chapter and generate cards + commentary.
|
89 |
-
|
90 |
-
Args:
|
91 |
-
file_data: File data from Gradio
|
92 |
-
chapter_idx: Index of chapter to process
|
93 |
-
"""
|
94 |
-
try:
|
95 |
-
if not file_data:
|
96 |
-
return None, "No file provided"
|
97 |
-
|
98 |
-
# Get chapter content
|
99 |
-
content = self.parser.get_chapter_content(chapter_idx)
|
100 |
-
|
101 |
-
# Generate cards and commentary
|
102 |
-
cards, commentary = await asyncio.gather(
|
103 |
-
self._generate_cards(content),
|
104 |
-
self._generate_commentary(content)
|
105 |
-
)
|
106 |
-
|
107 |
-
# Parse and store cards
|
108 |
-
self.current_cards = json.loads(cards)
|
109 |
-
self.current_index = 0
|
110 |
-
self.approved_cards = []
|
111 |
-
|
112 |
-
# Return first card and commentary
|
113 |
-
return self._get_current_card(), commentary
|
114 |
-
|
115 |
-
except Exception as e:
|
116 |
-
return None, f"Error: {str(e)}"
|
117 |
-
finally:
|
118 |
-
self.parser.cleanup()
|
119 |
-
|
120 |
-
async def _generate_cards(self, content: str) -> str:
|
121 |
-
"""Generate flashcards using Claude."""
|
122 |
-
response = await self.claude.messages.create(
|
123 |
-
model="claude-3-opus-20240229",
|
124 |
-
max_tokens=4000,
|
125 |
-
system=self.prompts["card_generation"],
|
126 |
-
messages=[{"role": "user", "content": content}]
|
127 |
-
)
|
128 |
-
return response.content[0].text
|
129 |
-
|
130 |
-
async def _generate_commentary(self, content: str) -> str:
|
131 |
-
"""Generate commentary using Claude."""
|
132 |
-
response = await self.claude.messages.create(
|
133 |
-
model="claude-3-opus-20240229",
|
134 |
-
max_tokens=4000,
|
135 |
-
system=self.prompts["commentary"],
|
136 |
-
messages=[{"role": "user", "content": content}]
|
137 |
-
)
|
138 |
-
return response.content[0].text
|
139 |
-
|
140 |
-
def _get_current_card(self) -> dict:
|
141 |
-
"""Get current card with UI state."""
|
142 |
-
if not self.current_cards or self.current_index >= len(self.current_cards):
|
143 |
-
return {
|
144 |
-
'front': "",
|
145 |
-
'back': "",
|
146 |
-
'category': "",
|
147 |
-
'status': "No more cards to review",
|
148 |
-
'show_buttons': False,
|
149 |
-
'show_upload': True
|
150 |
-
}
|
151 |
-
|
152 |
-
card = self.current_cards[self.current_index]
|
153 |
-
return {
|
154 |
-
'front': card['front'],
|
155 |
-
'back': card['back'],
|
156 |
-
'category': card['category'],
|
157 |
-
'status': f"Card {self.current_index + 1} of {len(self.current_cards)}",
|
158 |
-
'show_buttons': True,
|
159 |
-
'show_upload': False
|
160 |
-
}
|
161 |
-
|
162 |
-
def accept_card(self, front: str, back: str, category: str) -> dict:
|
163 |
-
"""Accept current card and move to next."""
|
164 |
-
if self.current_index < len(self.current_cards):
|
165 |
-
self.approved_cards.append({
|
166 |
-
'front': front,
|
167 |
-
'back': back,
|
168 |
-
'category': category
|
169 |
-
})
|
170 |
-
|
171 |
-
self.current_index += 1
|
172 |
-
return self._get_current_card()
|
173 |
-
|
174 |
-
def reject_card(self) -> dict:
|
175 |
-
"""Reject current card and move to next."""
|
176 |
-
if self.current_index < len(self.current_cards):
|
177 |
-
self.current_cards.pop(self.current_index)
|
178 |
-
return self._get_current_card()
|
179 |
-
|
180 |
-
def upload_to_mochi(self) -> str:
|
181 |
-
"""Upload approved cards to Mochi."""
|
182 |
-
if not self.approved_cards:
|
183 |
-
return "No cards to upload!"
|
184 |
-
|
185 |
-
results = []
|
186 |
-
for card in self.approved_cards:
|
187 |
-
try:
|
188 |
-
# Format card for Mochi
|
189 |
-
mochi_card = {
|
190 |
-
"deck-id": DECK_CATEGORIES[card["category"]],
|
191 |
-
"fields": {
|
192 |
-
"name": {"id": "name", "value": card["front"]},
|
193 |
-
"back": {"id": "back", "value": card["back"]}
|
194 |
-
}
|
195 |
-
}
|
196 |
-
|
197 |
-
# Upload to Mochi
|
198 |
-
response = requests.post(
|
199 |
-
"https://app.mochi.cards/api/cards",
|
200 |
-
json=mochi_card,
|
201 |
-
auth=(self.mochi_key, "")
|
202 |
-
)
|
203 |
-
|
204 |
-
if response.status_code != 200:
|
205 |
-
results.append(f"Error: {response.text}")
|
206 |
-
|
207 |
-
except Exception as e:
|
208 |
-
results.append(f"Error: {str(e)}")
|
209 |
-
|
210 |
-
# Clear approved cards
|
211 |
-
success_count = len(self.approved_cards) - len(results)
|
212 |
-
self.approved_cards = []
|
213 |
-
|
214 |
-
if results:
|
215 |
-
return f"Uploaded {success_count} cards with {len(results)} errors:\n" + "\n".join(results)
|
216 |
-
return f"Successfully uploaded {success_count} cards to Mochi!"
|
217 |
-
|
218 |
-
def create_interface():
|
219 |
-
"""Create the Gradio interface."""
|
220 |
-
generator = CardGenerator()
|
221 |
-
|
222 |
-
with gr.Blocks(title="Document Reader & Card Generator") as app:
|
223 |
-
# Document upload and chapter selection
|
224 |
-
with gr.Row():
|
225 |
-
file_input = gr.File(
|
226 |
-
label="Upload EPUB Document",
|
227 |
-
type="binary",
|
228 |
-
file_types=[".epub"]
|
229 |
-
)
|
230 |
-
|
231 |
-
chapter_select = gr.Dropdown(
|
232 |
-
label="Select Chapter",
|
233 |
-
choices=[],
|
234 |
-
interactive=True,
|
235 |
-
visible=False
|
236 |
-
)
|
237 |
-
|
238 |
-
def update_chapters(file):
|
239 |
-
if not file:
|
240 |
-
return gr.update(choices=[], visible=False)
|
241 |
-
chapters = generator.get_chapter_list(file)
|
242 |
-
return gr.update(choices=chapters, visible=True, value=chapters[0] if chapters else None)
|
243 |
-
|
244 |
-
file_input.change(
|
245 |
-
fn=update_chapters,
|
246 |
-
inputs=[file_input],
|
247 |
-
outputs=[chapter_select]
|
248 |
-
)
|
249 |
-
|
250 |
-
process_btn = gr.Button("Process Chapter")
|
251 |
-
|
252 |
-
# Commentary section
|
253 |
-
commentary = gr.Textbox(
|
254 |
-
label="Commentary",
|
255 |
-
lines=10,
|
256 |
-
interactive=False
|
257 |
-
)
|
258 |
-
|
259 |
-
# Card review section
|
260 |
-
gr.Markdown("## Review Cards")
|
261 |
-
|
262 |
-
with gr.Row():
|
263 |
-
card_front = gr.Textbox(
|
264 |
-
label="Front",
|
265 |
-
lines=3,
|
266 |
-
interactive=True
|
267 |
-
)
|
268 |
-
card_back = gr.Textbox(
|
269 |
-
label="Back",
|
270 |
-
lines=3,
|
271 |
-
interactive=True
|
272 |
-
)
|
273 |
-
|
274 |
-
with gr.Row():
|
275 |
-
deck_category = gr.Dropdown(
|
276 |
-
choices=list(DECK_CATEGORIES.keys()),
|
277 |
-
label="Deck Category",
|
278 |
-
value="AI"
|
279 |
-
)
|
280 |
-
card_status = gr.Textbox(
|
281 |
-
label="Status",
|
282 |
-
interactive=False
|
283 |
-
)
|
284 |
-
|
285 |
-
with gr.Row():
|
286 |
-
accept_btn = gr.Button("Accept & Next", visible=False)
|
287 |
-
reject_btn = gr.Button("Reject & Next", visible=False)
|
288 |
-
upload_btn = gr.Button("Upload to Mochi", visible=False)
|
289 |
-
|
290 |
-
upload_status = gr.Textbox(
|
291 |
-
label="Upload Status",
|
292 |
-
interactive=False
|
293 |
-
)
|
294 |
-
|
295 |
-
# Event handlers
|
296 |
-
async def process_chapter(file, chapter_idx):
|
297 |
-
card, comment = await generator.process_chapter(file, chapter_idx)
|
298 |
-
if not card: # Error occurred
|
299 |
-
return [
|
300 |
-
"", "", comment, gr.update(visible=False),
|
301 |
-
gr.update(visible=False), "", gr.update(visible=False)
|
302 |
-
]
|
303 |
-
|
304 |
-
return [
|
305 |
-
card['front'],
|
306 |
-
card['back'],
|
307 |
-
comment,
|
308 |
-
gr.update(visible=card['show_buttons']),
|
309 |
-
gr.update(visible=card['show_buttons']),
|
310 |
-
card['status'],
|
311 |
-
gr.update(visible=card['show_upload'])
|
312 |
-
]
|
313 |
-
|
314 |
-
def handle_card_action(action, front, back, category):
|
315 |
-
card = (generator.accept_card(front, back, category)
|
316 |
-
if action == 'accept' else
|
317 |
-
generator.reject_card())
|
318 |
-
|
319 |
-
return [
|
320 |
-
card['front'],
|
321 |
-
card['back'],
|
322 |
-
card['status'],
|
323 |
-
gr.update(visible=card['show_buttons']),
|
324 |
-
gr.update(visible=card['show_buttons']),
|
325 |
-
card['category'],
|
326 |
-
gr.update(visible=card['show_upload'])
|
327 |
-
]
|
328 |
-
|
329 |
-
# Connect events
|
330 |
-
process_btn.click(
|
331 |
-
fn=process_chapter,
|
332 |
-
inputs=[file_input, chapter_select],
|
333 |
-
outputs=[
|
334 |
-
card_front, card_back, commentary,
|
335 |
-
accept_btn, reject_btn, card_status, upload_btn
|
336 |
-
]
|
337 |
-
)
|
338 |
-
|
339 |
-
accept_btn.click(
|
340 |
-
fn=lambda f, b, c: handle_card_action('accept', f, b, c),
|
341 |
-
inputs=[card_front, card_back, deck_category],
|
342 |
-
outputs=[
|
343 |
-
card_front, card_back, card_status,
|
344 |
-
accept_btn, reject_btn, deck_category, upload_btn
|
345 |
-
]
|
346 |
-
)
|
347 |
-
|
348 |
-
reject_btn.click(
|
349 |
-
fn=lambda: handle_card_action('reject', None, None, None),
|
350 |
-
outputs=[
|
351 |
-
card_front, card_back, card_status,
|
352 |
-
accept_btn, reject_btn, deck_category, upload_btn
|
353 |
-
]
|
354 |
-
)
|
355 |
-
|
356 |
-
upload_btn.click(
|
357 |
-
fn=generator.upload_to_mochi,
|
358 |
-
outputs=[upload_status]
|
359 |
-
)
|
360 |
-
|
361 |
-
return app
|
362 |
-
|
363 |
-
if __name__ == "__main__":
|
364 |
-
create_interface().launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts/card_generation.txt
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
You are an expert at creating high-quality spaced repetition flashcards that promote deep understanding and retention. Your task is to generate flashcards from the given text that are:
|
2 |
-
|
3 |
-
1. Clear and concise
|
4 |
-
2. Focus on one concept per card
|
5 |
-
3. Test understanding rather than just recall
|
6 |
-
4. Avoid overly complex or compound questions
|
7 |
-
5. Use precise language
|
8 |
-
|
9 |
-
Each card must be assigned to one of these categories:
|
10 |
-
- CS/Hardware
|
11 |
-
- Math/Physics
|
12 |
-
- AI
|
13 |
-
- History/Military
|
14 |
-
- Quotes/Random
|
15 |
-
- Bio
|
16 |
-
- Econ/Finance
|
17 |
-
|
18 |
-
Format each card as a JSON object:
|
19 |
-
{
|
20 |
-
"category": "Category name from the list above",
|
21 |
-
"front": "Question or prompt",
|
22 |
-
"back": "Answer or explanation"
|
23 |
-
}
|
24 |
-
|
25 |
-
Example cards:
|
26 |
-
{
|
27 |
-
"category": "Bio",
|
28 |
-
"front": "What is the key difference between procedural and declarative memory?",
|
29 |
-
"back": "Procedural memory is for skills and procedures (how to ride a bike), while declarative memory is for facts and events (what you had for breakfast)."
|
30 |
-
}
|
31 |
-
|
32 |
-
{
|
33 |
-
"category": "Bio",
|
34 |
-
"front": "What role does the hippocampus play in memory formation?",
|
35 |
-
"back": "The hippocampus is crucial for converting short-term memories into long-term memories through a process called consolidation. It acts as a temporary storage and processing center before memories are distributed to other parts of the cortex."
|
36 |
-
}
|
37 |
-
|
38 |
-
Please generate 5-10 high-quality flashcards from the provided text. Focus on the most important concepts, insights, and relationships. Format the output as a JSON array containing the card objects.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts/clips.txt
CHANGED
@@ -1,5 +1,9 @@
|
|
1 |
You are a social media expert for the Dwarkesh Podcast. Generate 10 viral-worthy clips from the transcript.
|
2 |
-
|
|
|
|
|
|
|
|
|
3 |
Tweet 1
|
4 |
Tweet Text: [text]
|
5 |
Clip Transcript: [45-120 seconds of transcript]
|
|
|
1 |
You are a social media expert for the Dwarkesh Podcast. Generate 10 viral-worthy clips from the transcript.
|
2 |
+
|
3 |
+
Here are examples of successful viral clips from previous episodes:
|
4 |
+
{clip_examples}
|
5 |
+
|
6 |
+
Format your output as:
|
7 |
Tweet 1
|
8 |
Tweet Text: [text]
|
9 |
Clip Transcript: [45-120 seconds of transcript]
|
prompts/commentary.txt
DELETED
@@ -1,25 +0,0 @@
|
|
1 |
-
You are an expert researcher and critical thinker. Your task is to analyze the provided text and generate insightful commentary that:
|
2 |
-
|
3 |
-
1. Identifies the key arguments, insights, and novel ideas
|
4 |
-
2. Highlights connections to other important concepts or fields
|
5 |
-
3. Points out particularly interesting or counterintuitive points
|
6 |
-
4. Suggests areas that merit further exploration
|
7 |
-
5. Notes any potential weaknesses or areas of uncertainty in the arguments
|
8 |
-
|
9 |
-
Your commentary should be scholarly but engaging, helping the reader develop a deeper understanding of the material. Focus on substance over style, and be specific rather than general.
|
10 |
-
|
11 |
-
Structure your response as follows:
|
12 |
-
|
13 |
-
Key Insights:
|
14 |
-
- [2-3 bullet points highlighting the most important takeaways]
|
15 |
-
|
16 |
-
Interesting Connections:
|
17 |
-
- [2-3 bullet points noting connections to other fields/concepts]
|
18 |
-
|
19 |
-
Worth Exploring Further:
|
20 |
-
- [1-2 bullet points suggesting related areas for deeper investigation]
|
21 |
-
|
22 |
-
Critical Notes:
|
23 |
-
- [1-2 bullet points on potential weaknesses or areas needing clarification]
|
24 |
-
|
25 |
-
Then provide 2-3 paragraphs of integrated analysis that weaves these points together into a coherent commentary.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts/description.txt
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
Create an engaging episode description tweet (280 chars max) that:
|
2 |
1. Highlights compelling aspects
|
3 |
2. Includes topic areas and handles
|
4 |
-
3. Ends with "Links below" or "Enjoy!"
|
|
|
|
|
|
|
|
1 |
Create an engaging episode description tweet (280 chars max) that:
|
2 |
1. Highlights compelling aspects
|
3 |
2. Includes topic areas and handles
|
4 |
+
3. Ends with "Links below" or "Enjoy!"
|
5 |
+
|
6 |
+
Here are examples of successful episode descriptions:
|
7 |
+
{description_examples}
|
prompts/enhance.txt
DELETED
@@ -1,46 +0,0 @@
|
|
1 |
-
You are an expert transcript editor. Your task is to enhance this transcript for maximum readability while maintaining the core message.
|
2 |
-
|
3 |
-
IMPORTANT: Respond ONLY with the enhanced transcript. Do not include any explanations, headers, or phrases like "Here is the transcript."
|
4 |
-
|
5 |
-
Note: Below you'll find an auto-generated transcript that may help with speaker identification, but focus on creating your own high-quality transcript from the audio.
|
6 |
-
|
7 |
-
Think about your job as if you were transcribing an interview for a print book where the priority is the reading audience. It should just be a total pleasure to read this as a written artifact where all the flubs and repetitions and conversational artifacts and filler words and false starts are removed, where a bunch of helpful punctuation is added. It should basically read like somebody wrote it specifically for reading rather than just something somebody said extemporaneously.
|
8 |
-
|
9 |
-
Please:
|
10 |
-
1. Fix speaker attribution errors, especially at segment boundaries. Watch for incomplete thoughts that were likely from the previous speaker.
|
11 |
-
|
12 |
-
2. Optimize AGGRESSIVELY for readability over verbatim accuracy:
|
13 |
-
- Readability is the most important thing!!
|
14 |
-
- Remove ALL conversational artifacts (yeah, so, I mean, etc.)
|
15 |
-
- Remove ALL filler words (um, uh, like, you know)
|
16 |
-
- Remove false starts and self-corrections completely
|
17 |
-
- Remove redundant phrases and hesitations
|
18 |
-
- Convert any indirect or rambling responses into direct statements
|
19 |
-
- Break up run-on sentences into clear, concise statements
|
20 |
-
- Maintain natural conversation flow while prioritizing clarity and directness
|
21 |
-
|
22 |
-
3. Format the output consistently:
|
23 |
-
- Keep the "Speaker X 00:00:00" format (no brackets, no other formatting)
|
24 |
-
- DO NOT change the timestamps. You're only seeing a chunk of the full transcript, which is why your 0:00:00 is not the true beginning. Keep the timestamps as they are.
|
25 |
-
- Add TWO line breaks between speaker/timestamp and the text
|
26 |
-
- Use proper punctuation and capitalization
|
27 |
-
- Add paragraph breaks for topic changes
|
28 |
-
- When you add paragraph breaks between the same speaker's remarks, no need to restate the speaker attribution
|
29 |
-
- Don't go more than four sentences without adding a paragraph break. Be liberal with your paragraph breaks.
|
30 |
-
- Preserve distinct speaker turns
|
31 |
-
|
32 |
-
Example input:
|
33 |
-
Speaker A 00:01:15
|
34 |
-
|
35 |
-
Um, yeah, so like, I've been working on this new project at work, you know? And uh, what's really interesting is that, uh, we're seeing these amazing results with the new approach we're taking. Like, it's just, you know, it's really transforming how we do things.
|
36 |
-
|
37 |
-
And then, I mean, the thing is, uh, when we showed it to the client last week, they were just, you know, completely blown away by what we achieved. Like, they couldn't even believe it was the same system they had before.
|
38 |
-
|
39 |
-
Example output:
|
40 |
-
Speaker A 00:01:15
|
41 |
-
|
42 |
-
I've been working on this new project at work, and we're seeing amazing results with our new approach. It's really transforming how we do things.
|
43 |
-
|
44 |
-
When we showed it to the client last week, they were completely blown away by what we achieved. They couldn't believe it was the same system they had before.
|
45 |
-
|
46 |
-
Enhance the following transcript, starting directly with the speaker format:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts/find_links.txt
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
You are an expert at identifying key terms, concepts, and references in text that would benefit from having reference links. Your task is to analyze the provided transcript text and identify terms that would genuinely help readers understand important context they might miss otherwise.
|
2 |
-
|
3 |
-
Focus ONLY on these types of terms:
|
4 |
-
1. Technical concepts and jargon that a general audience might not be familiar with
|
5 |
-
2. Research papers or academic works mentioned or referenced
|
6 |
-
3. Blog posts, articles, or online resources that are specifically cited
|
7 |
-
4. Books that are discussed (to link to Goodreads/Amazon)
|
8 |
-
5. Specific projects, tools, or technologies that are central to the discussion
|
9 |
-
6. Names of lesser-known people who made significant contributions being discussed
|
10 |
-
|
11 |
-
DO NOT identify:
|
12 |
-
1. Common words or general concepts (like "short", "editor", "polymath")
|
13 |
-
2. Basic technical terms that most people would know
|
14 |
-
3. Generic job titles or roles
|
15 |
-
4. Common industry terms
|
16 |
-
5. Basic scientific concepts
|
17 |
-
6. Well-known companies or organizations
|
18 |
-
|
19 |
-
Remember: Only identify terms where having a reference would genuinely add value by providing important context or deeper understanding that the audience might otherwise miss.
|
20 |
-
|
21 |
-
Respond in this format for each term:
|
22 |
-
TERM: <the exact term as it appears in text>
|
23 |
-
REASON: <1-2 sentences explaining why this term should be linked>
|
24 |
-
|
25 |
-
Example input:
|
26 |
-
"We used GPT-4 to implement the RLHF technique described in the Constitutional AI paper, similar to what Anthropic did with Claude."
|
27 |
-
|
28 |
-
Example output:
|
29 |
-
TERM: RLHF
|
30 |
-
REASON: A complex technical concept (Reinforcement Learning from Human Feedback) that's crucial to understanding modern AI development but might be unfamiliar to general audiences.
|
31 |
-
|
32 |
-
TERM: Constitutional AI paper
|
33 |
-
REASON: A specific research paper that introduced important concepts being referenced; readers might want to read the original source.
|
34 |
-
|
35 |
-
Analyze the following transcript text and identify key terms that should be linked:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts/timestamps.txt
CHANGED
@@ -7,5 +7,5 @@ You are a podcast timestamp generator. Create 5-7 timestamps for this episode, f
|
|
7 |
|
8 |
Output the timestamps in chronological order, one per line.
|
9 |
|
10 |
-
|
11 |
{timestamps_examples}
|
|
|
7 |
|
8 |
Output the timestamps in chronological order, one per line.
|
9 |
|
10 |
+
Here are examples from previous episodes:
|
11 |
{timestamps_examples}
|
prompts/titles_and_thumbnails.txt
CHANGED
@@ -13,6 +13,9 @@ Thumbnail: 2-4 ALL CAPS words that amplify the intrigue
|
|
13 |
- Create intellectual curiosity without sensationalism
|
14 |
- Make the viewer wonder "What's the story here?"
|
15 |
|
|
|
|
|
|
|
16 |
Example:
|
17 |
Title: "David Reich – How One Small Tribe Conquered the World 70,000 Years Ago"
|
18 |
Thumbnail: "LAST HUMANS STANDING"
|
|
|
13 |
- Create intellectual curiosity without sensationalism
|
14 |
- Make the viewer wonder "What's the story here?"
|
15 |
|
16 |
+
Here are examples of successful title-thumbnail combinations from previous episodes:
|
17 |
+
{title_examples}
|
18 |
+
|
19 |
Example:
|
20 |
Title: "David Reich – How One Small Tribe Conquered the World 70,000 Years Ago"
|
21 |
Thumbnail: "LAST HUMANS STANDING"
|
requirements.txt
CHANGED
@@ -1,13 +1,4 @@
|
|
1 |
-
gradio
|
2 |
-
deepgram-sdk
|
3 |
-
google-generativeai
|
4 |
-
anthropic
|
5 |
-
pandas
|
6 |
youtube-transcript-api
|
7 |
-
|
8 |
-
assemblyai
|
9 |
-
pytube
|
10 |
-
PyPDF2
|
11 |
-
EbookLib
|
12 |
-
beautifulsoup4
|
13 |
python-dotenv
|
|
|
1 |
+
gradio>=4.0.0
|
|
|
|
|
|
|
|
|
2 |
youtube-transcript-api
|
3 |
+
anthropic
|
|
|
|
|
|
|
|
|
|
|
4 |
python-dotenv
|
scripts/add_links.py
DELETED
@@ -1,209 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
from pathlib import Path
|
3 |
-
import os
|
4 |
-
import re
|
5 |
-
from typing import List, Dict, Tuple
|
6 |
-
from dataclasses import dataclass
|
7 |
-
import anthropic
|
8 |
-
from exa_py import Exa
|
9 |
-
|
10 |
-
@dataclass
class Term:
    """A phrase picked out for linking, paired with the stated justification."""

    # The phrase exactly as it should be searched for and linked.
    term: str
    # Short explanation of why the phrase deserves a reference link.
    reason: str
|
15 |
-
|
16 |
-
@dataclass
class Link:
    """A search hit chosen as the reference for one term."""

    # The term this link was found for.
    term: str
    # Address of the selected search result.
    url: str
    # Title reported for the selected search result.
    title: str
|
22 |
-
|
23 |
-
def chunk_text(text: str, max_chunk_size: int = 2000) -> List[str]:
    """Split text into chunks at paragraph boundaries.

    Paragraphs (separated by a blank line) are packed greedily into chunks
    whose joined length does not exceed ``max_chunk_size``.

    Args:
        text: The full text to split.
        max_chunk_size: Upper bound, in characters, for a joined chunk.

    Returns:
        The chunks in document order. Empty or whitespace-only input gives an
        empty list. A single paragraph longer than ``max_chunk_size`` is never
        split and is returned as its own oversized chunk.
    """
    separator = "\n\n"
    chunks: List[str] = []
    pending: List[str] = []
    pending_size = 0

    for para in text.split(separator):
        # Blank paragraphs carry nothing worth analysing and would otherwise
        # produce empty chunks (e.g. for empty input).
        if not para.strip():
            continue

        # Count the separator that join() will insert, so the joined chunk
        # really stays within the limit.
        added = len(para) + (len(separator) if pending else 0)
        if pending and pending_size + added > max_chunk_size:
            chunks.append(separator.join(pending))
            pending = [para]
            pending_size = len(para)
        else:
            pending.append(para)
            pending_size += added

    if pending:
        chunks.append(separator.join(pending))

    return chunks
|
44 |
-
|
45 |
-
def parse_claude_response(response: str) -> List[Term]:
    """Extract ``TERM:`` / ``REASON:`` pairs from the model's reply.

    Lines are stripped before inspection. A term is only kept once a reason
    line has been seen for it; any other line is ignored.

    Args:
        response: Raw reply text.

    Returns:
        The parsed terms, in the order they appear in the reply.
    """
    term_tag = "TERM: "
    reason_tag = "REASON: "

    parsed = []
    pending_term = None
    pending_reason = None

    for raw_line in response.split("\n"):
        entry = raw_line.strip()

        if entry.startswith(term_tag):
            # A new term begins: flush the previous one if it is complete.
            if pending_term and pending_reason:
                parsed.append(Term(pending_term, pending_reason))
            pending_term = entry[len(term_tag):].strip()
            pending_reason = None
        elif entry.startswith(reason_tag):
            pending_reason = entry[len(reason_tag):].strip()

    # The last term has no following TERM line to flush it.
    if pending_term and pending_reason:
        parsed.append(Term(pending_term, pending_reason))

    return parsed
|
70 |
-
|
71 |
-
def find_links_for_terms(exa: Exa, terms: List[Term]) -> Dict[str, Link]:
    """Find one link per distinct term using Exa search.

    Args:
        exa: Client whose ``search`` method is queried once per distinct term.
        terms: Terms to look up; the same term may appear several times
            (for example when it was flagged in more than one chunk).

    Returns:
        Mapping from term text to the top search result. Terms with no
        results, or whose lookup failed, are absent.
    """
    links = {}
    attempted = set()

    for term in terms:
        name = term.term
        # Each lookup is a remote search call; never repeat it for a term
        # that has already been tried.
        if name in attempted:
            continue
        attempted.add(name)

        try:
            results = exa.search(name, num_results=1, type="auto")
            if results.results:
                top = results.results[0]
                links[name] = Link(term=name, url=top.url, title=top.title)
        except Exception as e:
            # Best effort: one failed lookup must not abort the whole run.
            print(f"Error finding link for {name}: {e}")
            continue

    return links
|
94 |
-
|
95 |
-
def add_links_to_text(text: str, links: Dict[str, "Link"]) -> str:
    """Turn the first occurrence of every linked term into a markdown link.

    All terms are matched in a single left-to-right pass, so text that has
    already been turned into a link (including its URL) is never scanned
    again. This prevents a short term from being linked inside the markup of
    a longer one.

    Args:
        text: The transcript text.
        links: Mapping from term to an object exposing a ``url`` attribute.

    Returns:
        The text with ``[term](url)`` substituted for the first whole-word
        occurrence of each term. Matching is case-sensitive.
    """
    # Longest first, so that at any position the regex alternation prefers
    # the longest overlapping term. Empty keys would match everywhere.
    terms = sorted((t for t in links if t), key=len, reverse=True)
    if not terms:
        return text

    # Lookarounds instead of \b: they also work for terms that start or end
    # with a non-word character (e.g. "C++").
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(t) for t in terms) + r")(?!\w)"
    )

    linked_terms = set()

    def link_first(match):
        term = match.group(0)
        if term in linked_terms:
            # Later occurrences stay as plain text.
            return term
        linked_terms.add(term)
        return f"[{term}]({links[term].url})"

    # A function replacement is inserted verbatim, so backslashes in a URL
    # are not interpreted as regex escapes.
    return pattern.sub(link_first, text)
|
119 |
-
|
120 |
-
def process_transcript(
    transcript_path: Path,
    claude_client: anthropic.Client,
    exa_client: Exa,
    prompt_template: str
) -> str:
    """Read a transcript and return it with reference links inserted.

    The transcript is split into chunks, each chunk is sent to the model with
    the prompt template prepended, the suggested terms are collected, a link
    is looked up for them, and the links are applied to the full text.

    Args:
        transcript_path: File containing the transcript.
        claude_client: Client used to request term suggestions.
        exa_client: Client used to search for links.
        prompt_template: Instructions placed before every chunk.

    Returns:
        The transcript text with markdown links added.
    """
    text = transcript_path.read_text()

    suggested = []
    for piece in chunk_text(text):
        reply = claude_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            system="You are a helpful AI assistant.",
            messages=[{"role": "user", "content": prompt_template + "\n\n" + piece}],
        )
        suggested.extend(parse_claude_response(reply.content[0].text))

    found = find_links_for_terms(exa_client, suggested)
    return add_links_to_text(text, found)
|
156 |
-
|
157 |
-
def main():
    """Command-line entry point: add reference links to a transcript file.

    Returns:
        0 when the linked transcript was written, 1 when processing failed.

    Raises:
        FileNotFoundError: If the transcript file or ``prompts/find_links.txt``
            does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "transcript",
        nargs="?",  # Make the argument optional
        default="output/transcripts/transcript.md",
        help="Path to transcript file (default: output/transcripts/transcript.md)"
    )
    parser.add_argument("--output", help="Output file path (default: input path with -linked suffix)")
    args = parser.parse_args()

    transcript_path = Path(args.transcript)
    if not transcript_path.exists():
        raise FileNotFoundError(f"Transcript file not found: {transcript_path}")

    # Default output sits next to the input, e.g. transcript-linked.md
    if args.output:
        output_path = Path(args.output)
    else:
        stem = transcript_path.stem
        output_path = transcript_path.parent / f"{stem}-linked{transcript_path.suffix}"

    # Read prompt template
    prompt_path = Path("prompts/find_links.txt")
    if not prompt_path.exists():
        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
    prompt_template = prompt_path.read_text()

    # API keys are taken from the environment
    claude_client = anthropic.Client(api_key=os.getenv("ANTHROPIC_API_KEY"))
    exa_client = Exa(api_key=os.getenv("EXA_API_KEY"))

    try:
        linked_text = process_transcript(
            transcript_path,
            claude_client,
            exa_client,
            prompt_template
        )

        output_path.write_text(linked_text)
        print(f"Processed transcript saved to: {output_path}")

    except Exception as e:
        print(f"Error processing transcript: {e}")
        return 1

    return 0


if __name__ == "__main__":
    # Hand main()'s status to the shell; calling main() bare always exited 0,
    # even when processing had failed.
    raise SystemExit(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/preview_generator.py
DELETED
@@ -1,92 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
from pathlib import Path
|
3 |
-
import os
|
4 |
-
from google import generativeai
|
5 |
-
from pydub import AudioSegment
|
6 |
-
|
7 |
-
|
8 |
-
class PreviewGenerator:
    """Generates preview suggestions for an episode with a Gemini model."""

    def __init__(self, api_key: str):
        """Configure the Gemini client and load the prompt from prompts/previews.txt."""
        generativeai.configure(api_key=api_key)
        self.model = generativeai.GenerativeModel("gemini-exp-1206")
        self.prompt = Path("prompts/previews.txt").read_text()

    async def generate_previews(self, audio_path: Path, transcript_path: Path = None) -> str:
        """Return the model's preview suggestions for an audio file.

        Args:
            audio_path: Audio file to analyse.
            transcript_path: Optional transcript; it is uploaded alongside the
                audio only when the path is given and exists.

        Returns:
            The text of the model's response.
        """
        import io

        print("Generating preview suggestions...")

        # Re-encode as low-quality MP3 in memory for faster processing.
        compressed = io.BytesIO()
        AudioSegment.from_file(audio_path).export(
            compressed, format="mp3", parameters=["-q:a", "9"]
        )
        compressed.seek(0)

        # The request is the prompt followed by the uploaded file references.
        parts = [
            self.prompt,
            generativeai.upload_file(compressed, mime_type="audio/mp3"),
        ]

        if transcript_path and transcript_path.exists():
            print("Including transcript in analysis...")
            parts.append(generativeai.upload_file(transcript_path))

        reply = await self.model.generate_content_async(parts)
        return reply.text
|
48 |
-
|
49 |
-
|
50 |
-
async def main():
    """Command-line entry point: generate preview suggestions for an audio file.

    The suggestions are written to ``output/previews.txt`` and echoed to the
    console.

    Returns:
        0 on success, 1 if generating or saving the suggestions failed.

    Raises:
        FileNotFoundError: If the audio file does not exist.
    """
    parser = argparse.ArgumentParser(description="Generate podcast preview suggestions")
    parser.add_argument("audio_file", help="Audio file to analyze")
    parser.add_argument("--transcript", "-t", help="Optional transcript file")
    args = parser.parse_args()

    audio_path = Path(args.audio_file)
    if not audio_path.exists():
        raise FileNotFoundError(f"File not found: {audio_path}")

    # A missing transcript is only a warning; the audio alone is enough.
    transcript_path = Path(args.transcript) if args.transcript else None
    if transcript_path and not transcript_path.exists():
        print(f"Warning: Transcript file not found: {transcript_path}")
        transcript_path = None

    # Ensure output directory exists
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / "previews.txt"

    try:
        generator = PreviewGenerator(os.getenv("GOOGLE_API_KEY"))
        suggestions = await generator.generate_previews(audio_path, transcript_path)

        # Save output
        output_path.write_text(suggestions)
        print(f"\nPreview suggestions saved to: {output_path}")

        # Also print to console
        print("\nPreview Suggestions:")
        print("-" * 40)
        print(suggestions)

    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0


if __name__ == "__main__":
    import asyncio

    # asyncio.run() returns main()'s status; pass it on to the shell instead
    # of discarding it, so a failed run no longer exits with status 0.
    raise SystemExit(asyncio.run(main()))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/process_playlist.py
DELETED
@@ -1,77 +0,0 @@
|
|
1 |
-
import asyncio
|
2 |
-
from pathlib import Path
|
3 |
-
import sys
|
4 |
-
import time
|
5 |
-
from typing import List
|
6 |
-
|
7 |
-
# Add the project root to Python path
|
8 |
-
sys.path.append(str(Path(__file__).parent.parent))
|
9 |
-
|
10 |
-
from utils.youtube_utils import get_transcript, get_playlist_video_ids
|
11 |
-
from utils.content_generator import ContentGenerator, ContentRequest
|
12 |
-
|
13 |
-
PLAYLIST_URL = "https://www.youtube.com/playlist?list=PLd7-bHaQwnthaNDpZ32TtYONGVk95-fhF"
|
14 |
-
MAX_CONCURRENT = 3 # Limit concurrent requests
|
15 |
-
RETRY_DELAY = 65 # Seconds to wait before retrying after rate limit
|
16 |
-
|
17 |
-
async def process_video(video_id: str, generator: ContentGenerator, retry_count: int = 0) -> str:
    """Generate title/thumbnail suggestions for one video.

    Args:
        video_id: Identifier of the video to process.
        generator: Content generator used to produce the suggestions.
        retry_count: Number of rate-limit retries already used.

    Returns:
        A formatted result section, or an empty string when no transcript is
        available or processing fails. Errors whose message contains
        "rate_limit_error" are retried after RETRY_DELAY seconds, as long as
        fewer than three retries have been used.
    """
    while True:
        try:
            print(f"Processing video {video_id}...")

            transcript = get_transcript(video_id)
            if not transcript:
                print(f"No transcript available for {video_id}")
                return ""

            request = ContentRequest("titles_and_thumbnails")
            result = await generator.generate_content(request, transcript)
            return f"Video ID: {video_id}\n{result}\n{'='*50}\n"

        except Exception as e:
            rate_limited = "rate_limit_error" in str(e)
            if not (rate_limited and retry_count < 3):
                print(f"Error processing {video_id}: {e}")
                return ""

            print(f"Rate limit hit for {video_id}, waiting {RETRY_DELAY}s before retry {retry_count + 1}")
            await asyncio.sleep(RETRY_DELAY)
            retry_count += 1
|
40 |
-
|
41 |
-
async def process_batch(video_ids: List[str], generator: ContentGenerator) -> List[str]:
    """Process the given videos concurrently.

    Every id is handed to process_video and all calls are awaited together;
    no throttling is applied inside this function.

    Returns:
        One result string per video id, in the same order as ``video_ids``.
    """
    pending = (process_video(vid, generator) for vid in video_ids)
    return await asyncio.gather(*pending)
|
45 |
-
|
46 |
-
async def process_playlist():
    """Process every video of PLAYLIST_URL in batches and save the results.

    Videos are handled MAX_CONCURRENT at a time with a short pause between
    batches. Non-empty results are written to
    ``output/playlist-titles-thumbnails.txt``.
    """
    generator = ContentGenerator()
    output_file = Path("output/playlist-titles-thumbnails.txt")

    print("Getting videos from playlist...")
    video_ids = get_playlist_video_ids(PLAYLIST_URL)
    total = len(video_ids)
    print(f"Found {total} videos")

    results = []
    for batch_no, offset in enumerate(range(0, total, MAX_CONCURRENT), start=1):
        print(f"\nProcessing batch {batch_no}")
        batch = video_ids[offset:offset + MAX_CONCURRENT]
        results.extend(await process_batch(batch, generator))

        # Pause only when another batch follows, to avoid rate limits.
        if offset + MAX_CONCURRENT < total:
            delay = 5  # Short delay between successful batches
            print(f"Waiting {delay}s before next batch...")
            await asyncio.sleep(delay)

    # Videos that produced nothing are left out of the file.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text("\n".join(entry for entry in results if entry))
    print(f"\nResults written to {output_file}")


if __name__ == "__main__":
    asyncio.run(process_playlist())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/transcript.py
DELETED
@@ -1,311 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
from dataclasses import dataclass
|
3 |
-
from pathlib import Path
|
4 |
-
import json
|
5 |
-
import hashlib
|
6 |
-
import os
|
7 |
-
from typing import List, Tuple, Iterator
|
8 |
-
import assemblyai as aai
|
9 |
-
from google import generativeai
|
10 |
-
from pydub import AudioSegment
|
11 |
-
import asyncio
|
12 |
-
import io
|
13 |
-
from multiprocessing import Pool
|
14 |
-
from functools import partial
|
15 |
-
from itertools import groupby
|
16 |
-
|
17 |
-
|
18 |
-
@dataclass
class Utterance:
    """One stretch of speech attributed to a single speaker."""

    speaker: str
    text: str
    start: int  # start time in ms, as reported by AssemblyAI
    end: int  # end time in ms, as reported by AssemblyAI

    @property
    def timestamp(self) -> str:
        """Return the start time rendered as HH:MM:SS."""
        whole_seconds = int(self.start // 1000)
        whole_minutes, secs = divmod(whole_seconds, 60)
        hrs, mins = divmod(whole_minutes, 60)
        return f"{hrs:02d}:{mins:02d}:{secs:02d}"
|
34 |
-
|
35 |
-
|
36 |
-
class Transcriber:
    """Handles getting and caching transcripts from AssemblyAI.

    Transcripts are cached as JSON under ``output/transcripts/.cache``, one
    file per audio file stem, together with an MD5 hash of the audio so that
    a changed file is transcribed again.
    """

    def __init__(self, api_key: str):
        """Store the AssemblyAI API key and make sure the cache directory exists."""
        aai.settings.api_key = api_key
        self.cache_dir = Path("output/transcripts/.cache")
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_transcript(self, audio_path: Path) -> List["Utterance"]:
        """Get the transcript for an audio file, using the cache if possible.

        Args:
            audio_path: Audio file to transcribe.

        Returns:
            The utterances of the transcript, in order.

        Raises:
            RuntimeError: If AssemblyAI returned no utterances (for example
                because the transcription failed).
        """
        cache_file = self.cache_dir / f"{audio_path.stem}.json"
        # Hash once and reuse it for both the cache check and the cache write.
        file_hash = self._get_file_hash(audio_path)

        cached = self._load_cache(cache_file, file_hash)
        if cached is not None:
            print("Using cached AssemblyAI transcript...")
            return cached

        print("Getting new transcript from AssemblyAI...")
        config = aai.TranscriptionConfig(speaker_labels=True, language_code="en")
        transcript = aai.Transcriber().transcribe(str(audio_path), config=config)

        if transcript.utterances is None:
            # Fail with a readable message instead of a TypeError from
            # iterating over None below.
            raise RuntimeError(
                f"AssemblyAI returned no utterances for {audio_path}: "
                f"{getattr(transcript, 'error', None)}"
            )

        utterances = [
            Utterance(speaker=u.speaker, text=u.text, start=u.start, end=u.end)
            for u in transcript.utterances
        ]

        # Cache the raw utterance data
        cache_data = {
            "hash": file_hash,
            "utterances": [
                {
                    "speaker": u.speaker,
                    "text": u.text,
                    "start": u.start,
                    "end": u.end
                }
                for u in utterances
            ]
        }
        with open(cache_file, "w") as f:
            json.dump(cache_data, f, indent=2)

        return utterances

    def _load_cache(self, cache_file: Path, file_hash: str):
        """Return the cached utterances for ``file_hash``, or None on a miss.

        A missing file, a hash mismatch, and an unreadable or malformed cache
        file all count as a miss, so the caller falls back to a fresh
        transcription instead of crashing.
        """
        if not cache_file.exists():
            return None

        try:
            with open(cache_file) as f:
                data = json.load(f)
            if data["hash"] != file_hash:
                return None
            return [
                Utterance(
                    speaker=u["speaker"],
                    text=u["text"],
                    start=u["start"],
                    end=u["end"]
                )
                for u in data["utterances"]
            ]
        except (ValueError, KeyError, TypeError) as e:
            # json.JSONDecodeError is a ValueError; KeyError/TypeError cover
            # a file that parses but does not have the expected layout.
            print(f"Ignoring unreadable transcript cache {cache_file}: {e}")
            return None

    def _get_file_hash(self, file_path: Path) -> str:
        """Calculate the MD5 hash of a file (change detection, not security)."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            # Read in blocks so large audio files are never loaded whole.
            for chunk in iter(lambda: f.read(65536), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
|
103 |
-
|
104 |
-
|
105 |
-
class Enhancer:
    """Enhances transcript chunks with a Gemini model."""

    def __init__(self, api_key: str):
        """Configure the Gemini client and load the prompt from prompts/enhance.txt."""
        generativeai.configure(api_key=api_key)
        self.model = generativeai.GenerativeModel("gemini-exp-1206")
        self.prompt = Path("prompts/enhance.txt").read_text()

    async def enhance_chunks(self, chunks: List[Tuple[str, io.BytesIO]]) -> List[str]:
        """Enhance all chunks concurrently, at most three requests at a time.

        Args:
            chunks: Pairs of (transcript text, audio buffer); each buffer is
                rewound and read in full when its request is sent.

        Returns:
            The response text for every chunk, in the order of ``chunks``.
        """
        total = len(chunks)
        print(f"Enhancing {total} chunks...")

        # Caps the number of requests that are in flight at once.
        limiter = asyncio.Semaphore(3)

        async def process_chunk(index: int, chunk: Tuple[str, io.BytesIO]) -> str:
            transcript_text, audio_buffer = chunk
            async with limiter:
                audio_buffer.seek(0)
                audio_part = {"mime_type": "audio/mp3", "data": audio_buffer.read()}
                reply = await self.model.generate_content_async(
                    [self.prompt, transcript_text, audio_part]
                )
                print(f"Completed chunk {index+1}/{total}")
                return reply.text

        pending = (process_chunk(i, chunk) for i, chunk in enumerate(chunks))
        return await asyncio.gather(*pending)
|
139 |
-
|
140 |
-
|
141 |
-
@dataclass
class SpeakerDialogue:
    """A run of consecutive utterances that all belong to one speaker."""
    speaker: str
    utterances: List[Utterance]

    @property
    def start(self) -> int:
        """Start time taken from the first utterance."""
        first = self.utterances[0]
        return first.start

    @property
    def end(self) -> int:
        """End time taken from the last utterance."""
        last = self.utterances[-1]
        return last.end

    @property
    def timestamp(self) -> str:
        """Timestamp string of the first utterance."""
        first = self.utterances[0]
        return first.timestamp

    def format(self, markdown: bool = False) -> str:
        """Render the dialogue as a header line followed by its utterances.

        Utterance texts are separated by a blank line and trailing
        whitespace is removed from the end of the combined text.

        Args:
            markdown: If True, the speaker is bolded and the timestamp
                italicised in the header line.
        """
        body = "\n\n".join(u.text for u in self.utterances).rstrip()
        if not markdown:
            return f"Speaker {self.speaker} {self.timestamp}\n\n{body}"
        return f"**Speaker {self.speaker}** *{self.timestamp}*\n\n{body}"
|
172 |
-
|
173 |
-
|
174 |
-
def group_utterances_by_speaker(utterances: List[Utterance]) -> Iterator[SpeakerDialogue]:
    """Yield one SpeakerDialogue per run of consecutive same-speaker utterances."""
    run = []
    for utterance in utterances:
        # A change of speaker closes the current run.
        if run and utterance.speaker != run[0].speaker:
            yield SpeakerDialogue(speaker=run[0].speaker, utterances=run)
            run = []
        run.append(utterance)
    if run:
        yield SpeakerDialogue(speaker=run[0].speaker, utterances=run)
|
178 |
-
|
179 |
-
|
180 |
-
def estimate_tokens(text: str, chars_per_token: int = 4) -> int:
    """Estimate the number of tokens in ``text``.

    The estimate is the text length divided by ``chars_per_token``, rounded
    up, so any non-empty text counts as at least one token.

    Args:
        text: The text to estimate tokens for.
        chars_per_token: Estimated characters per token (default 4).

    Returns:
        The estimated token count; 0 for an empty string.

    Raises:
        ValueError: If ``chars_per_token`` is not positive. A zero divisor
            would otherwise raise ZeroDivisionError and a negative one would
            yield a meaningless negative count.
    """
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be a positive integer")
    # Integer ceiling division without going through floats.
    return (len(text) + chars_per_token - 1) // chars_per_token
|
188 |
-
|
189 |
-
|
190 |
-
def chunk_dialogues(
    dialogues: Iterator[SpeakerDialogue],
    max_tokens: int = 2000,
    chars_per_token: int = 4
) -> List[List[SpeakerDialogue]]:
    """Pack dialogues, in order, into groups that stay within a token budget.

    A dialogue is added to the current group unless the group's combined
    formatted text would then be estimated above ``max_tokens``; in that case
    the group is closed and the dialogue starts a new one. A dialogue is never
    split, so a single oversized dialogue ends up in a group of its own.

    Args:
        dialogues: Iterator of SpeakerDialogues
        max_tokens: Maximum estimated tokens per group
        chars_per_token: Estimated characters per token (default 4)
    """
    batches = []
    batch = []
    batch_text = ""

    for entry in dialogues:
        rendered = entry.format()
        candidate = batch_text + "\n\n" + rendered if batch_text else rendered

        # An empty batch always accepts the entry, whatever its size.
        if not batch or estimate_tokens(candidate, chars_per_token) <= max_tokens:
            batch.append(entry)
            batch_text = candidate
            continue

        # Budget exceeded: close the running batch and start over with entry.
        batches.append(batch)
        batch = [entry]
        batch_text = rendered

    if batch:
        batches.append(batch)

    return batches
|
224 |
-
|
225 |
-
|
226 |
-
def format_chunk(dialogues: List[SpeakerDialogue], markdown: bool = False) -> str:
    """Render a list of dialogues as one text, separated by blank lines.

    Args:
        dialogues: Dialogues to render, in output order
        markdown: Forwarded to each dialogue's ``format`` call
    """
    rendered = [entry.format(markdown=markdown) for entry in dialogues]
    return "\n\n".join(rendered)
|
233 |
-
|
234 |
-
|
235 |
-
def prepare_audio_chunks(audio_path: Path, utterances: List[Utterance]) -> List[Tuple[str, io.BytesIO]]:
    """Pair each dialogue chunk's text with the matching slice of audio.

    Utterances are grouped by speaker and packed into chunks; for every chunk
    the audio between the first dialogue's start and the last dialogue's end
    is exported as MP3 into an in-memory buffer.

    Returns:
        One ``(plain-text chunk, MP3 buffer)`` tuple per chunk.
    """
    groups = chunk_dialogues(group_utterances_by_speaker(utterances))
    print(f"Preparing {len(groups)} audio segments...")

    # The source file is decoded a single time and sliced per chunk.
    full_audio = AudioSegment.from_file(audio_path)

    pairs = []
    for group in groups:
        first, last = group[0], group[-1]
        clip = full_audio[first.start:last.end]
        mp3_buffer = io.BytesIO()
        # "-q:a 9" asks for a low-quality encode, which is faster to produce.
        clip.export(mp3_buffer, format="mp3", parameters=["-q:a", "9"])
        # Plain (non-markdown) text is what gets sent to Gemini.
        chunk_text = format_chunk(group, markdown=False)
        pairs.append((chunk_text, mp3_buffer))

    return pairs
|
257 |
-
|
258 |
-
|
259 |
-
def main():
    """Transcribe an audio file and write raw and enhanced Markdown transcripts.

    Reads the audio path from the command line, obtains utterances from the
    transcriber, writes them to ``output/transcripts/autogenerated-transcript.md``,
    then enhances the transcript chunk by chunk and writes the result to
    ``output/transcripts/transcript.md``.

    Returns:
        0 on success, 1 if transcription, enhancement or writing fails.

    Raises:
        FileNotFoundError: If the audio file given on the command line does
            not exist.
    """
    import sys  # local import: only needed to report errors on stderr

    parser = argparse.ArgumentParser()
    parser.add_argument("audio_file", help="Audio file to transcribe")
    args = parser.parse_args()

    audio_path = Path(args.audio_file)
    if not audio_path.exists():
        raise FileNotFoundError(f"File not found: {audio_path}")

    out_dir = Path("output/transcripts")
    out_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Check both keys up front so a missing Gemini key is reported before
        # any transcription work is done, not after it.
        api_keys = {
            name: os.getenv(name)
            for name in ("ASSEMBLYAI_API_KEY", "GOOGLE_API_KEY")
        }
        missing = [name for name, value in api_keys.items() if not value]
        if missing:
            raise RuntimeError(
                f"Missing environment variable(s): {', '.join(missing)}"
            )

        # Get transcript
        transcriber = Transcriber(api_keys["ASSEMBLYAI_API_KEY"])
        utterances = transcriber.get_transcript(audio_path)

        # Save original transcript (markdown for the final output). The
        # grouping iterator is materialised because format_chunk takes a list.
        dialogues = list(group_utterances_by_speaker(utterances))
        original = format_chunk(dialogues, markdown=True)
        # Explicit UTF-8: transcripts may hold characters that the platform's
        # default encoding cannot represent.
        (out_dir / "autogenerated-transcript.md").write_text(original, encoding="utf-8")

        # Enhance transcript
        enhancer = Enhancer(api_keys["GOOGLE_API_KEY"])
        chunks = prepare_audio_chunks(audio_path, utterances)
        enhanced = asyncio.run(enhancer.enhance_chunks(chunks))

        # Merge the enhanced chunks, then apply markdown to the speaker headers
        merged = "\n\n".join(chunk.strip() for chunk in enhanced)
        merged = apply_markdown_formatting(merged)
        (out_dir / "transcript.md").write_text(merged, encoding="utf-8")

        print("\nTranscripts saved to:")
        print(f"- {out_dir}/autogenerated-transcript.md")
        print(f"- {out_dir}/transcript.md")

    except Exception as e:
        # Top-level boundary: report on stderr and signal failure through the
        # return value instead of a traceback.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0
|
301 |
-
|
302 |
-
|
303 |
-
def apply_markdown_formatting(text: str) -> str:
    """Bold every ``Speaker X`` label and italicise the HH:MM:SS stamp after it.

    Only a label followed by a single space and a two-digit-per-field
    timestamp is rewritten; all other text is returned unchanged.
    """
    import re

    header = re.compile(r"(Speaker \w+) (\d{2}:\d{2}:\d{2})")
    return header.sub(r"**\1** *\2*", text)
|
308 |
-
|
309 |
-
|
310 |
-
if __name__ == "__main__":
    # main() returns 0 on success and 1 on failure; pass that on as the
    # process exit status so shells and callers can detect a failed run.
    raise SystemExit(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
# Empty file to make utils a package
|
|
|
|
utils/content_generator.py
DELETED
@@ -1,79 +0,0 @@
|
|
1 |
-
import anthropic
|
2 |
-
from dataclasses import dataclass
|
3 |
-
from pathlib import Path
|
4 |
-
import asyncio
|
5 |
-
import concurrent.futures
|
6 |
-
import time
|
7 |
-
from typing import Dict, List
|
8 |
-
import pandas as pd
|
9 |
-
|
10 |
-
# Module-level Anthropic client, created once at import time and used by
# ContentGenerator.generate_content for its messages.create call.
# NOTE(review): constructed without arguments — presumably credentials come
# from the environment; confirm against the anthropic SDK configuration.
client = anthropic.Anthropic()
|
11 |
-
|
12 |
-
@dataclass
class ContentRequest:
    """Parameters for a single content-generation call.

    ``prompt_key`` selects the prompt (and, where one is configured, the
    example CSV) used by ``ContentGenerator.generate_content``;
    ``max_tokens`` and ``temperature`` are passed through to the model call.
    """
    prompt_key: str
    max_tokens: int = 2000
    temperature: float = 1.0
|
17 |
-
|
18 |
-
class ContentGenerator:
    """Generates podcast content from a transcript using Claude.

    Prompts are loaded once from ``prompts/<key>.txt`` into
    ``current_prompts``; entries in that dict may be replaced afterwards to
    customise generation.
    """

    # Prompt keys whose default prompt is loaded from prompts/<key>.txt.
    _PROMPT_KEYS = ("previews", "clips", "description", "timestamps", "titles_and_thumbnails")

    # prompt key -> (CSV file under data/, columns to pull examples from).
    # Keys without an entry get no examples appended to their prompt.
    _EXAMPLE_CONFIGS = {
        "clips": ("Viral Twitter Clips.csv", ["Tweet Text", "Clip Transcript"]),
        "description": ("Viral Episode Descriptions.csv", ["Tweet Text"]),
        "timestamps": ("Timestamps.csv", ["Timestamps"]),
        "titles_and_thumbnails": ("Titles & Thumbnails.csv", ["Titles", "Thumbnail"]),
    }

    def __init__(self):
        self.current_prompts = self._load_default_prompts()

    def _load_default_prompts(self) -> Dict[str, str]:
        """Load default prompts from files."""
        # Explicit UTF-8 so prompt text does not depend on the locale encoding.
        return {
            key: Path(f"prompts/{key}.txt").read_text(encoding="utf-8")
            for key in self._PROMPT_KEYS
        }

    def _load_examples(self, filename: str, columns: List[str]) -> str:
        """Load examples from a CSV file under ``data/``.

        With one column, the non-missing values are joined by blank lines.
        With several, each row in which every column is present becomes a
        ``column: value`` block, and blocks are joined by blank lines.

        Returns:
            The formatted examples, or "" if the file cannot be loaded
            (best effort: generation proceeds without examples).
        """
        try:
            df = pd.read_csv(f"data/{filename}")
            if len(columns) == 1:
                # str() keeps a stray numeric cell from failing the join and
                # thereby discarding every example in the file.
                return "\n\n".join(str(value) for value in df[columns[0]].dropna())

            examples = []
            for _, row in df.iterrows():
                if all(pd.notna(row[col]) for col in columns):
                    examples.append("\n".join(f"{col}: {row[col]}" for col in columns))
            return "\n\n".join(examples)
        except Exception as e:
            print(f"Error loading {filename}: {str(e)}")
            return ""

    async def generate_content(self, request: ContentRequest, transcript: str) -> str:
        """Generate content using Claude asynchronously.

        Args:
            request: Selects the prompt and carries the sampling parameters.
            transcript: Transcript text sent as the user message.

        Returns:
            The text of the first content block of the model's reply.

        Raises:
            KeyError: If ``request.prompt_key`` has no entry in
                ``current_prompts``.
        """
        print(f"Starting {request.prompt_key} generation...")
        start_time = time.time()

        # Build prompt with examples
        full_prompt = self.current_prompts[request.prompt_key]
        if config := self._EXAMPLE_CONFIGS.get(request.prompt_key):
            if examples := self._load_examples(*config):
                full_prompt += f"\n\nPrevious examples:\n{examples}"

        def call_model():
            return client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=request.max_tokens,
                temperature=request.temperature,
                system=full_prompt,
                messages=[{"role": "user", "content": [{"type": "text", "text": f"Process this transcript:\n\n{transcript}"}]}]
            )

        # The client call is blocking, so it runs in a worker thread to keep
        # the event loop free. get_running_loop() is the call intended for
        # use inside a coroutine.
        loop = asyncio.get_running_loop()
        with concurrent.futures.ThreadPoolExecutor() as pool:
            message = await loop.run_in_executor(pool, call_model)

        result = message.content[0].text
        print(f"Finished {request.prompt_key} in {time.time() - start_time:.2f} seconds")
        return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/document_parser.py
DELETED
@@ -1,191 +0,0 @@
|
|
1 |
-
from pathlib import Path
|
2 |
-
import tempfile
|
3 |
-
import os
|
4 |
-
from ebooklib import epub
|
5 |
-
from bs4 import BeautifulSoup
|
6 |
-
|
7 |
-
class DocumentParser:
    """Simple EPUB document parser that extracts chapters and their content.

    Typical use: ``load_document`` to obtain the chapter titles,
    ``get_chapter_content`` to read one chapter by index, and ``cleanup`` to
    remove the temporary copy of the EPUB when finished.
    """

    def __init__(self):
        self._temp_file = None
        self._book = None
        self._chapters = []
        # Title of the chapter most recently requested via get_chapter_content.
        self._current_chapter_title = None

    def load_document(self, file_data, filename=None) -> list[str]:
        """Load an EPUB document and extract chapter titles.

        Args:
            file_data: Raw EPUB bytes, or an object with a ``read()`` method
                that returns them (e.g. file data from Gradio).
            filename: Optional filename (not used).

        Returns:
            The chapter titles, in the order the chapters were found.

        Raises:
            ValueError: If the data cannot be read as an EPUB.
        """
        # Clean up any previous temp file
        self.cleanup()

        content = file_data.read() if hasattr(file_data, 'read') else file_data

        # The EPUB reader is given a path, so the bytes are spooled to disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.epub') as temp:
            # Record the path before writing so cleanup() can remove the file
            # even if the write itself fails.
            self._temp_file = temp.name
            temp.write(content)

        try:
            self._book = epub.read_epub(self._temp_file)
            print("DEBUG: Successfully read EPUB file")
        except Exception as e:
            print(f"DEBUG: Error reading EPUB: {str(e)}")
            # Do not leave the temp file behind for an unreadable upload.
            self.cleanup()
            raise ValueError(f"Failed to read EPUB: {str(e)}") from e

        self._chapters = self._extract_chapters()
        print(f"DEBUG: Extracted {len(self._chapters)} chapters")

        return [chapter['title'] for chapter in self._chapters]

    def get_chapter_content(self, chapter_idx: int) -> str:
        """Get the text content of a specific chapter.

        Args:
            chapter_idx: Zero-based index into the titles returned by
                ``load_document``.

        Raises:
            ValueError: If no document is loaded or the index is out of range.
        """
        if not self._book or not self._chapters:
            raise ValueError("No document loaded")

        if not 0 <= chapter_idx < len(self._chapters):
            raise ValueError(f"Invalid chapter index: {chapter_idx}")

        chapter = self._chapters[chapter_idx]
        self._current_chapter_title = chapter['title'].strip()

        print(f"DEBUG: Getting content for chapter: {self._current_chapter_title}")
        content = self._get_chapter_text(chapter['item'])
        print(f"DEBUG: Extracted {len(content)} characters of content")

        return content

    def _extract_chapters(self) -> list[dict]:
        """Extract chapters as ``{'title': ..., 'item': ...}`` dicts.

        Three strategies are tried in order until one yields chapters: the
        table of contents, headings found in the documents, and finally one
        chapter per document.
        """
        print("DEBUG: Checking table of contents...")
        chapters = self._chapters_from_toc()

        if not chapters:
            print("DEBUG: No chapters in TOC, scanning documents...")
            chapters = self._chapters_from_headings()

        if not chapters:
            print("DEBUG: No chapters found, using documents as chapters")
            chapters = self._chapters_from_documents()

        return chapters

    def _resolve_href(self, href):
        """Return the book item for a TOC href, or None if there is none."""
        doc = self._book.get_item_with_href(href)
        if not doc and '#' in href:
            # TOC entries often point at an anchor inside a file
            # ("chapter1.xhtml#section2"); retry with the fragment removed.
            doc = self._book.get_item_with_href(href.split('#', 1)[0])
        return doc

    def _chapters_from_toc(self) -> list[dict]:
        """Collect chapters by walking the book's table of contents."""
        chapters = []
        if not hasattr(self._book, 'toc'):
            return chapters

        # Debug the TOC structure
        print("DEBUG: TOC structure:")
        for item in self._book.toc:
            print(f"DEBUG: TOC item type: {type(item)}")
            if isinstance(item, tuple):
                print(f"DEBUG: Tuple length: {len(item)}")
                if len(item) > 1:
                    print(f"DEBUG: Second item type: {type(item[1])}")
                    if isinstance(item[1], (list, tuple)):
                        print(f"DEBUG: Sub-items count: {len(item[1])}")

        def add_entry(entry, level):
            # Only entries carrying both a title and an href can be resolved.
            if not (hasattr(entry, 'title') and hasattr(entry, 'href')):
                return
            doc = self._resolve_href(entry.href)
            if doc:
                # Nested entries are indented by their depth in the TOC.
                chapters.append({'title': " " * level + entry.title, 'item': doc})

        def walk(entries, level=0):
            for item in entries:
                if isinstance(item, tuple):
                    # (section, children) pair: record the section itself,
                    # then descend into whatever follows it.
                    add_entry(item[0], level)
                    if len(item) > 1:
                        if isinstance(item[1], (list, tuple)):
                            walk(item[1], level + 1)
                        elif hasattr(item[1], 'title'):  # single sub-item
                            walk([item[1]], level + 1)
                else:
                    add_entry(item, level)

        walk(self._book.toc)
        print(f"DEBUG: Found {len(chapters)} chapters in TOC")
        print("DEBUG: Chapter titles found:")
        for ch in chapters:
            print(f" - {ch['title']}")
        return chapters

    def _chapters_from_headings(self) -> list[dict]:
        """Collect chapters from headings found inside the documents."""
        chapters = []
        docs = [item for item in self._book.get_items()
                if item.get_type() == epub.ITEM_DOCUMENT]

        print(f"DEBUG: Found {len(docs)} documents to scan")

        for doc in docs:
            soup = BeautifulSoup(doc.get_content(), 'html.parser')

            # h1/h2 tags plus anything whose class mentions chapter/title
            headings = (
                soup.find_all(['h1', 'h2']) +
                soup.find_all(class_=lambda x: x and ('chapter' in x.lower() or 'title' in x.lower()))
            )

            seen = set()
            for heading in headings:
                # A tag matched by both searches (e.g. <h1 class="chapter">)
                # appears twice in the list; keep only its first occurrence.
                if id(heading) in seen:
                    continue
                seen.add(id(heading))

                # Collapse runs of whitespace in the heading text
                title = ' '.join(heading.get_text().split())
                if title:
                    chapters.append({'title': title, 'item': doc})
        return chapters

    def _chapters_from_documents(self) -> list[dict]:
        """Treat every document in the book as a numbered chapter."""
        chapters = []
        for doc in self._book.get_items():
            if doc.get_type() == epub.ITEM_DOCUMENT:
                chapters.append({
                    'title': f"Chapter {len(chapters) + 1}",
                    'item': doc
                })
        return chapters

    def _get_chapter_text(self, item) -> str:
        """Extract text content from a chapter.

        Text nodes are collected from the body (or main element, or the whole
        document), skipping script/style content and nodes whose direct parent
        is a nav or header element, and joined with blank lines.

        Returns:
            The chapter text; "" if the content could not be parsed at all.
        """
        soup = None
        try:
            soup = BeautifulSoup(item.get_content(), 'html.parser')

            # Remove script and style elements
            for element in soup(['script', 'style']):
                element.decompose()

            content_area = soup.find('body') or soup.find('main') or soup

            text_blocks = []
            for element in content_area.find_all(string=True):
                stripped = element.strip()
                if stripped and element.parent.name not in ['script', 'style', 'nav', 'header']:
                    text_blocks.append(stripped)

            return '\n\n'.join(text_blocks)

        except Exception as e:
            print(f"DEBUG: Error extracting text: {str(e)}")
            if soup is None:
                # Parsing itself failed, so there is nothing to fall back on.
                return ""
            # Fallback to simple text extraction
            return soup.get_text(separator='\n\n', strip=True)

    def cleanup(self):
        """Delete the temporary EPUB copy, if any, and reset all state."""
        if self._temp_file and os.path.exists(self._temp_file):
            os.unlink(self._temp_file)
        self._temp_file = None
        self._book = None
        self._chapters = []
        self._current_chapter_title = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/youtube_utils.py
DELETED
@@ -1,26 +0,0 @@
|
|
1 |
-
from youtube_transcript_api import YouTubeTranscriptApi
|
2 |
-
from pytube import Playlist
|
3 |
-
import re
|
4 |
-
from typing import Optional, List
|
5 |
-
|
6 |
-
# Recognised URL shapes: watch pages (with "v" anywhere in the query string),
# embed, /v/, shorts and live paths on youtube.com or youtube-nocookie.com,
# and youtu.be short links.
_VIDEO_ID_PATTERN = re.compile(
    r"(?:youtube(?:-nocookie)?\.com/(?:watch\?(?:[^#\s]*?&)?v=|embed/|v/|shorts/|live/)"
    r"|youtu\.be/)"
    r"([A-Za-z0-9_-]+)"
)


def extract_video_id(url: str) -> Optional[str]:
    """Extract video ID from various YouTube URL formats.

    Args:
        url: A YouTube URL; may be empty or None.

    Returns:
        The video ID, or None if ``url`` is empty or matches no known format.
    """
    if not url:
        return None
    match = _VIDEO_ID_PATTERN.search(url)
    return match.group(1) if match else None
|
13 |
-
|
14 |
-
def get_transcript(video_id: str) -> str:
    """Return the English transcript of a video as one space-joined string.

    Any failure while listing, selecting or fetching the transcript is
    printed and results in an empty string.
    """
    try:
        available = YouTubeTranscriptApi.list_transcripts(video_id)
        english = available.find_transcript(["en"])
        pieces = [entry["text"] for entry in english.fetch()]
        return " ".join(pieces)
    except Exception as e:
        print(f"Error fetching transcript for {video_id}: {str(e)}")
        return ""
|
22 |
-
|
23 |
-
def get_playlist_video_ids(playlist_url: str) -> List[str]:
    """Return the video ID of every entry in a YouTube playlist.

    Each ID is whatever follows ``watch?v=`` in the playlist's video URLs.
    """
    video_ids = []
    for watch_url in Playlist(playlist_url).video_urls:
        video_ids.append(watch_url.split("watch?v=")[1])
    return video_ids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|