re-orged the repo
- app.py +133 -165
- {source → data}/Timestamps.csv +0 -0
- {source → data}/Titles & Thumbnails.csv +0 -0
- {source → data}/Viral Episode Descriptions.csv +0 -0
- {source → data}/Viral Twitter Clips.csv +0 -0
- prompts/clips.txt +5 -0
- prompts/description.txt +4 -0
- prompt.txt → prompts/enhance.txt +0 -0
- prompts/timestamps.txt +11 -0
- prompts/titles_and_thumbnails.txt +23 -0
- transcript.py → scripts/transcript.py +2 -2
- source/.DS_Store +0 -0
- test.txt +0 -53
app.py CHANGED
@@ -1,181 +1,150 @@
 import gradio as gr
 import anthropic
 import pandas as pd
-from typing import Tuple, Dict
+from typing import Tuple, Dict, List
 from youtube_transcript_api import YouTubeTranscriptApi
 import re
+from pathlib import Path
+import asyncio
+import concurrent.futures
+from dataclasses import dataclass
+import time

 # Initialize Anthropic client
 client = anthropic.Anthropic()

-… (the DEFAULT_PROMPTS dict literal is truncated in the rendered diff)
-Thumbnail: "LAST HUMANS STANDING"
-
-The combination should create intellectual curiosity without clickbait.
-
-Previous examples:
-{titles_and_thumbnails_examples}""",
-}
-
-# Current prompts used in the session
-current_prompts = DEFAULT_PROMPTS.copy()
-
-
-def load_examples(filename: str, columns: list) -> str:
-    """Load examples from CSV file."""
-    try:
-        df = pd.read_csv(f"source/{filename}")
-        if len(columns) == 1:
-            examples = df[columns[0]].dropna().tolist()
-            return "\n\n".join(examples)
-
-        examples = []
-        for _, row in df.iterrows():
-            if all(pd.notna(row[col]) for col in columns):
-                example = "\n".join(f"{col}: {row[col]}" for col in columns)
-                examples.append(example)
-        return "\n\n".join(examples)
-    except Exception as e:
-        print(f"Error loading {filename}: {str(e)}")
-        return ""
-
-
-def generate_content(
-    prompt_key: str, transcript: str, max_tokens: int = 1000, temp: float = 0.6
-) -> str:
-    """Generate content using Claude."""
-    examples = {
-        "clips": load_examples(
-            "Viral Twitter Clips.csv", ["Tweet Text", "Clip Transcript"]
-        ),
-        "description": load_examples("Viral Episode Descriptions.csv", ["Tweet Text"]),
-        "timestamps": load_examples("Timestamps.csv", ["Timestamps"]),
-        "titles_and_thumbnails": load_examples(
-            "Titles & Thumbnails.csv", ["Titles", "Thumbnail"]
-        ),
-    }
-
-    message = client.messages.create(
-        model="claude-3-5-sonnet-20241022",
-        max_tokens=max_tokens,
-        temperature=temp,
-        system=current_prompts[prompt_key].format(
-            **{f"{prompt_key}_examples": examples[prompt_key]}
-        ),
-        messages=[
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"Process this transcript:\n\n{transcript}",
-                    }
-                ],
-            }
-        ],
-    )
-    return message.content[0].text
-
-
-def get_youtube_transcript(url: str) -> str:
-    """Get transcript from YouTube URL."""
-    try:
-        video_id = re.search(
-            r"(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/)([A-Za-z0-9_-]+)",
-            url,
-        ).group(1)
-        transcript = YouTubeTranscriptApi.list_transcripts(video_id).find_transcript(
-            ["en"]
-        )
-        return " ".join(entry["text"] for entry in transcript.fetch())
-    except Exception as e:
-        raise Exception(f"Error fetching YouTube transcript: {str(e)}")
-
-
-def process_transcript(input_text: str) -> Tuple[str, str, str, str]:
-    """Process input and generate all content."""
-    try:
-        # Get transcript from URL or use direct input
-        transcript = (
-            get_youtube_transcript(input_text)
-            if any(x in input_text for x in ["youtube.com", "youtu.be"])
-            else input_text
-        )
-
-        # Generate all content types
-        return (
-            generate_content("clips", transcript, max_tokens=8192),
-            generate_content("description", transcript),
-            generate_content("timestamps", transcript, temp=0.4),
-            generate_content("titles_and_thumbnails", transcript, temp=0.7),
-        )
-    except Exception as e:
-        error_msg = f"Error processing input: {str(e)}"
-        return (error_msg,) * 4
-
-
-def update_prompts(*values) -> str:
-    """Update the current session's prompts."""
-    global current_prompts
-    current_prompts = {
-        "clips": values[0],
-        "description": values[1],
-        "timestamps": values[2],
-        "titles_and_thumbnails": values[3],
-    }
-    return (
-        "Prompts updated for this session! Changes will reset when you reload the page."
-    )
+@dataclass
+class ContentRequest:
+    prompt_key: str
+    max_tokens: int = 2000
+    temperature: float = 0.6
+
+class TranscriptProcessor:
+    def __init__(self):
+        self.current_prompts = self._load_default_prompts()
+
+    def _load_default_prompts(self) -> Dict[str, str]:
+        """Load default prompts from files."""
+        return {
+            key: Path(f"prompts/{key}.txt").read_text()
+            for key in ["clips", "description", "timestamps", "titles_and_thumbnails"]
+        }
+
+    def _load_examples(self, filename: str, columns: List[str]) -> str:
+        """Load examples from CSV file."""
+        try:
+            df = pd.read_csv(f"data/{filename}")
+            if len(columns) == 1:
+                return "\n\n".join(df[columns[0]].dropna().tolist())
+
+            examples = []
+            for _, row in df.iterrows():
+                if all(pd.notna(row[col]) for col in columns):
+                    example = "\n".join(f"{col}: {row[col]}" for col in columns)
+                    examples.append(example)
+            return "\n\n".join(examples)
+        except Exception as e:
+            print(f"Error loading {filename}: {str(e)}")
+            return ""
+
+    async def _generate_content(self, request: ContentRequest, transcript: str) -> str:
+        """Generate content using Claude asynchronously."""
+        print(f"Starting {request.prompt_key} generation...")
+        start_time = time.time()
+
+        example_configs = {
+            "clips": ("Viral Twitter Clips.csv", ["Tweet Text", "Clip Transcript"]),
+            "description": ("Viral Episode Descriptions.csv", ["Tweet Text"]),
+            "timestamps": ("Timestamps.csv", ["Timestamps"]),
+            "titles_and_thumbnails": ("Titles & Thumbnails.csv", ["Titles", "Thumbnail"]),
+        }
+
+        # Build prompt with examples
+        full_prompt = self.current_prompts[request.prompt_key]
+        if config := example_configs.get(request.prompt_key):
+            if examples := self._load_examples(*config):
+                full_prompt += f"\n\nPrevious examples:\n{examples}"
+
+        # Run API call in thread pool
+        loop = asyncio.get_event_loop()
+        with concurrent.futures.ThreadPoolExecutor() as pool:
+            message = await loop.run_in_executor(
+                pool,
+                lambda: client.messages.create(
+                    model="claude-3-5-sonnet-20241022",
+                    max_tokens=request.max_tokens,
+                    temperature=request.temperature,
+                    system=full_prompt,
+                    messages=[{"role": "user", "content": [{"type": "text", "text": f"Process this transcript:\n\n{transcript}"}]}]
+                )
+            )
+        result = message.content[0].text
+        print(f"Finished {request.prompt_key} in {time.time() - start_time:.2f} seconds")
+        return result
+
+    def _get_youtube_transcript(self, url: str) -> str:
+        """Get transcript from YouTube URL."""
+        try:
+            video_id = re.search(
+                r"(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/)([A-Za-z0-9_-]+)",
+                url
+            ).group(1)
+            transcript = YouTubeTranscriptApi.list_transcripts(video_id).find_transcript(["en"])
+            return " ".join(entry["text"] for entry in transcript.fetch())
+        except Exception as e:
+            raise Exception(f"Error fetching YouTube transcript: {str(e)}")
+
+    async def process_transcript(self, input_text: str) -> Tuple[str, str, str, str]:
+        """Process input and generate all content."""
+        try:
+            # Get transcript from URL or use direct input
+            transcript = (
+                self._get_youtube_transcript(input_text)
+                if any(x in input_text for x in ["youtube.com", "youtu.be"])
+                else input_text
+            )
+
+            # Define content generation requests
+            requests = [
+                ContentRequest("clips", max_tokens=8192),
+                ContentRequest("description"),
+                ContentRequest("timestamps", temperature=0.4),
+                ContentRequest("titles_and_thumbnails", temperature=0.7),
+            ]
+
+            # Generate all content concurrently
+            results = await asyncio.gather(
+                *[self._generate_content(req, transcript) for req in requests]
+            )
+            return tuple(results)
+
+        except Exception as e:
+            return (f"Error processing input: {str(e)}",) * 4
+
+    def update_prompts(self, *values) -> str:
+        """Update the current session's prompts."""
+        keys = ["clips", "description", "timestamps", "titles_and_thumbnails"]
+        self.current_prompts = dict(zip(keys, values))
+        return "Prompts updated for this session! Changes will reset when you reload the page."

 def create_interface():
     """Create the Gradio interface."""
+    processor = TranscriptProcessor()
+
     with gr.Blocks(title="Podcast Transcript Analyzer") as app:
         with gr.Tab("Generate Content"):
             gr.Markdown("# Podcast Content Generator")
-            input_text = gr.Textbox(
-                label="Input", placeholder="YouTube URL or transcript...", lines=10
-            )
+            input_text = gr.Textbox(label="Input", placeholder="YouTube URL or transcript...", lines=10)
             submit_btn = gr.Button("Generate Content")
             outputs = [
-                gr.Textbox(label=…
-                …
-                gr.Textbox(label="Timestamps", lines=10, interactive=False),
-                gr.Textbox(
-                    label="Title & Thumbnail Suggestions", lines=10, interactive=False
-                ),
+                gr.Textbox(label=label, lines=10, interactive=False)
+                for label in ["Twitter Clips", "Twitter Description", "Timestamps", "Title & Thumbnail Suggestions"]
             ]
-            …
+
+            async def process_wrapper(text):
+                return await processor.process_transcript(text)
+
+            submit_btn.click(fn=process_wrapper, inputs=[input_text], outputs=outputs)

         with gr.Tab("Experiment with Prompts"):
             gr.Markdown("# Experiment with Prompts")
@@ -190,42 +159,41 @@ def create_interface():

             prompt_inputs = [
                 gr.Textbox(
-                    label="Clips Prompt", lines=10, value=…
+                    label="Clips Prompt", lines=10, value=processor.current_prompts["clips"]
                 ),
                 gr.Textbox(
                     label="Description Prompt",
                     lines=10,
-                    value=…
+                    value=processor.current_prompts["description"],
                 ),
                 gr.Textbox(
                     label="Timestamps Prompt",
                     lines=10,
-                    value=…
+                    value=processor.current_prompts["timestamps"],
                 ),
                 gr.Textbox(
                     label="Titles & Thumbnails Prompt",
                     lines=10,
-                    value=…
+                    value=processor.current_prompts["titles_and_thumbnails"],
                 ),
             ]
             status = gr.Textbox(label="Status", interactive=False)

             # Update prompts when they change
             for prompt in prompt_inputs:
-                prompt.change(fn=update_prompts, inputs=prompt_inputs, outputs=[status])
+                prompt.change(fn=processor.update_prompts, inputs=prompt_inputs, outputs=[status])

             # Reset button
             reset_btn = gr.Button("Reset to Default Prompts")
             reset_btn.click(
                 fn=lambda: (
-                    update_prompts(*…
-                    *…
+                    processor.update_prompts(*processor.current_prompts.values()),
+                    *processor.current_prompts.values(),
                 ),
                 outputs=[status] + prompt_inputs,
             )

     return app

-
 if __name__ == "__main__":
     create_interface().launch()
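The heart of this refactor: the four Claude calls now overlap instead of running back to back. Each blocking client.messages.create call is pushed onto a thread pool and awaited, so asyncio.gather can run all four at once. A minimal runnable sketch of that pattern, with the Anthropic call stood in by a hypothetical slow_api function (time.sleep as the blocking work):

import asyncio
import concurrent.futures
import time

def slow_api(name: str) -> str:
    """Stand-in for a blocking SDK call such as client.messages.create."""
    time.sleep(1)
    return f"{name} done"

async def generate(name: str) -> str:
    # Push the blocking call onto a worker thread so the event loop stays free.
    loop = asyncio.get_event_loop()
    with concurrent.futures.ThreadPoolExecutor() as pool:
        return await loop.run_in_executor(pool, lambda: slow_api(name))

async def main() -> None:
    start = time.time()
    results = await asyncio.gather(
        *(generate(n) for n in ["clips", "description", "timestamps", "titles_and_thumbnails"])
    )
    print(results, f"elapsed: {time.time() - start:.1f}s")  # ~1s total rather than ~4s

asyncio.run(main())

asyncio.to_thread (Python 3.9+) is the shorter spelling of the same move; the executor form above mirrors what the commit does.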
{source → data}/Timestamps.csv RENAMED
File without changes

{source → data}/Titles & Thumbnails.csv RENAMED
File without changes

{source → data}/Viral Episode Descriptions.csv RENAMED
File without changes

{source → data}/Viral Twitter Clips.csv RENAMED
File without changes
prompts/clips.txt ADDED
@@ -0,0 +1,5 @@
+You are a social media expert for the Dwarkesh Podcast. Generate 10 viral-worthy clips from the transcript.
+Format as:
+Tweet 1
+Tweet Text: [text]
+Clip Transcript: [45-120 seconds of transcript]
prompts/description.txt ADDED
@@ -0,0 +1,4 @@
+Create an engaging episode description tweet (280 chars max) that:
+1. Highlights compelling aspects
+2. Includes topic areas and handles
+3. Ends with "Links below" or "Enjoy!"
prompt.txt → prompts/enhance.txt RENAMED
File without changes
prompts/timestamps.txt ADDED
@@ -0,0 +1,11 @@
+You are a podcast timestamp generator. Create 5-7 timestamps for this episode, following these rules:
+- Space timestamps roughly 10 minutes apart
+- Use only 1-3 words per timestamp
+- Focus on the most important discussion points
+- Use this format exactly: "0:00 First Topic"
+- Skip minor tangents or small talk
+
+Output the timestamps in chronological order, one per line.
+
+Previous examples:
+{timestamps_examples}
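Note that this file keeps a {timestamps_examples} placeholder even though the refactored _generate_content appends a "Previous examples:" block to the prompt rather than formatting it in; the placeholder matches how the old generate_content injected examples via str.format. A minimal sketch of that old substitution, with a made-up template and example rows:

# Hypothetical shortened template; the real text is prompts/timestamps.txt above.
template = "Create 5-7 timestamps.\n\nPrevious examples:\n{timestamps_examples}"
examples = "0:00 Intro\n10:12 Dark energy"  # made-up example rows
prompt_key = "timestamps"
# The old code built the keyword name dynamically per prompt key.
system_prompt = template.format(**{f"{prompt_key}_examples": examples})
print(system_prompt)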
prompts/titles_and_thumbnails.txt ADDED
@@ -0,0 +1,23 @@
+Create 3-5 compelling title-thumbnail combinations that tell an intellectually fascinating story.
+
+The goal is to capture the most mind-blowing insight or narrative from the episode that would make curious people think "I HAVE to hear this story."
+
+Title Format: "Guest Name – [The Most Intriguing Story/Insight from the Episode]"
+- Focus on one powerful story/insight rather than a list of topics
+- The title should make people wonder "How is that possible?" or "I need to know more"
+- Avoid generic listicles like "Guest - Topic 1, Topic 2, & Topic 3"
+- Never use clickbait or culture war bait
+
+Thumbnail: 2-4 ALL CAPS words that amplify the intrigue
+- Should work together with the title to tell a story
+- Create intellectual curiosity without sensationalism
+- Make the viewer wonder "What's the story here?"
+
+Example:
+Title: "David Reich – How One Small Tribe Conquered the World 70,000 Years Ago"
+Thumbnail: "LAST HUMANS STANDING"
+Why it works: Creates genuine curiosity about an epic historical story. The thumbnail adds mystery - which tribe? why did they survive when others didn't?
+
+Bad Example:
+Title: "David Reich - Human Evolution, Neanderthals, & The Yamnaya"
+Why it's weak: Generic list of topics, doesn't tell a story or create intrigue
transcript.py → scripts/transcript.py RENAMED
@@ -102,8 +102,8 @@ class Enhancer:
         generativeai.configure(api_key=api_key)
         self.model = generativeai.GenerativeModel("gemini-exp-1206")

-        # …
-        prompt_path = Path(…
+        # Update prompt path
+        prompt_path = Path("prompts/enhance.txt")
         self.prompt = prompt_path.read_text()

     async def enhance_chunks(self, chunks: List[tuple[str, io.BytesIO]]) -> List[str]:
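With prompt.txt folded into prompts/ alongside the four new prompt files, every prompt the code reads now lives in one directory. A small sanity-check sketch, assuming it runs from the repo root (the same working-directory assumption app.py's _load_default_prompts makes):

from pathlib import Path

# File names are the ones introduced in this commit; running from the
# repo root is an assumption.
expected = ["clips", "description", "timestamps", "titles_and_thumbnails", "enhance"]
missing = [key for key in expected if not Path(f"prompts/{key}.txt").exists()]
print("missing prompt files:", missing or "none")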
source/.DS_Store DELETED
Binary file (6.15 kB)
test.txt DELETED
@@ -1,53 +0,0 @@
-Speaker A 00:00:00
-
-Today, I'm chatting with Adam Brown, a founder and lead of the Blueshift team at Google DeepMind, which is cracking maths and reasoning, and a theoretical physicist at Stanford. Adam, welcome.
-
-Speaker B 00:00:11
-
-Super excited to be here. Let's do this.
-
-Speaker A 00:00:13
-
-First question. What is going to be the ultimate fate of the universe? And how much confidence should we have?
-
-Speaker B 00:00:19
-
-I think it depends on physics we don't yet fully understand because the ultimate fate is a long time away. That extends a long way out into the future. It also probably depends on us. It's probably in our hands, depending on how the unknown physics breaks out.
-
-Our idea of the answer to that question has changed quite a lot over the last century. In the 1930s, when they turned on the big telescopes, they discovered that the universe was expanding, which they were not previously aware of. The question is, how fast is it expanding?
-
-Then in the 1990s, we discovered something that really surprised us. There had been a learned debate up to that point about whether it was expanding so slowly that it would just expand and then recollapse in a big crunch or whether it was expanding sufficiently fast that it would just keep going forever, maybe slowing down in its expansion but not growing forever.
-
-Then, in possibly the worst day in human history in terms of expected value, in the 90s, we discovered something that had not been anticipated: not only is it expanding, but the rate at which it's expanding is accelerating. It's getting faster and faster as it expands. This is what's called a cosmological constant or dark energy.
-
-That completely changes the answer to the question, "What is the ultimate fate?" if it's really there. Because it means that distant galaxies, galaxies that are more than maybe 20 billion light-years away from us right now, are being dragged away from us by the expansion of the universe. We'll never reach them. We'll never get to them because even if we headed towards them at the speed of light, the expansion of the universe is dragging them away faster than we'll be able to catch up with them.
-
-That's really bad news because we have plans for those galaxies. Maybe we could go get them and turn them into tropical Edos or computronium or whatever we had a plan for. We can't if the cosmological constant is really there because they're being dragged away from us by the expansion of the universe.
-
-So how confident of that picture should we be? In answer to your question, according to that picture, eventually, the ultimate fate will just be that these universes get dragged away. Only the galaxies that are currently within a dozen billion light-years of us will we be able to reach.
-
-Speaker A 00:02:57
-
-Wait, a dozen light-years?
-
-Speaker B 00:02:58
-
-Sorry, a dozen billion light-years. A dozen light-years is not many other galaxies.
-
-Maybe a dozen billion light-years, those ones we'll be able to run out and grab. But anything beyond that is just going to be dragged away from us by the cosmological constant. So that's just a finite number of galaxies and a finite amount of resources.
-
-But then you ask, how confident should we be? On first principles grounds, you should not be particularly confident in that answer at all. We've had a number of radical reimaginings of what the expansion and fate of the universe is in the last century, including in my lifetime.
-
-So just on first principles grounds, you might imagine that you shouldn't be very confident, and indeed you shouldn't. We're not totally confident that the dark energy that currently seems to be pushing the universe apart is indeed going to be a feature of our universe forever. Things could change a lot.
-
-Including, you could imagine that a future civilization could manipulate the cosmological constant and bleed it away or manipulate it in some way in order to avoid the heat death.
-
-Speaker A 00:04:10
-
-Can you say more about that? How would one do this, and how far would it apply? How much would it expand the cosmic horizon?
-
-Speaker B 00:04:18
-
-Now we're getting to more speculative levels, but it does seem to be a feature of our best theories, a completely untested feature, but a feature nevertheless, of our best theories that combine quantum mechanics and gravity that the cosmological constant isn't just some fixed value.
-
-In fact, it can take different values, the amount of dark energy, the energy density, and dark energy in what's called different vacuums. For example, string theory has this property that there are many, many vacuums, if string theory is correct, in which the cosmological constant can take very different values. And that perhaps provides some hope.
|