einanao committed on
Commit
0e1487a
·
1 Parent(s): 395a84b

add example + docs

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. app.py +60 -21
  3. example.png +3 -0
.gitattributes CHANGED
@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
34
+ *.png filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -164,7 +164,7 @@ def format_duration(duration):
164
 
165
  def strike(url, speedup_factor, min_speedup, max_speedup, max_num_segments):
166
 
167
- min_speedup = max(0.5, min_speedup) # ffmpeg limit
168
 
169
  with st.spinner("downloading..."):
170
  name = download(url, YDL_OPTS)
@@ -207,13 +207,6 @@ def strike(url, speedup_factor, min_speedup, max_speedup, max_num_segments):
207
  with st.spinner("stitching segments..."):
208
  cat_clips(squashed_times, speedups, audio_path, output_path)
209
 
210
- spedup_total_duration, actual_speedup_factor = compute_actual_speedup(
211
- squashed_durations, speedups, total_duration
212
- )
213
- st.write("original duration: %s" % format_duration(total_duration))
214
- st.write("new duration: %s" % format_duration(spedup_total_duration))
215
- st.write("speedup: %0.2f" % actual_speedup_factor)
216
-
217
  times = np.array([(seg["start"] + seg["end"]) / 2 for seg in segments])
218
  times /= 60
219
  annotations = [seg["text"] for seg in segments]
@@ -240,6 +233,10 @@ def strike(url, speedup_factor, min_speedup, max_speedup, max_num_segments):
240
  )
241
  )
242
  st.altair_chart((lines + dots).interactive(), use_container_width=True)
 
 
 
 
243
 
244
  times = sum([list(x) for x in squashed_times], [])
245
  times = np.array(times)
@@ -251,7 +248,7 @@ def strike(url, speedup_factor, min_speedup, max_speedup, max_num_segments):
251
  max_actual_speedups = max(speedups)
252
  eps = 0.1
253
  lines = (
254
- alt.Chart(df, title="speedup based on information rate")
255
  .mark_line()
256
  .encode(
257
  x=alt.X(cols[0], scale=alt.Scale(domain=(min_time, max_time))),
@@ -265,26 +262,68 @@ def strike(url, speedup_factor, min_speedup, max_speedup, max_num_segments):
265
  )
266
  st.altair_chart(lines.interactive(), use_container_width=True)
267
 
268
- return output_path
269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
  with st.form("my_form"):
272
  url = st.text_input(
273
  "youtube url", value="https://www.youtube.com/watch?v=_3MBQm7GFIM"
274
  )
275
  speedup_factor = st.slider("speedup", min_value=1.0, max_value=10.0, value=1.5)
276
- min_speedup = 1
277
- max_speedup = st.slider("maximum speedup", min_value=1.0, max_value=10.0, value=2.0)
278
- speedup_factor = min(speedup_factor, max_speedup)
279
  max_num_segments = st.slider(
280
  "variance in speedup over time", min_value=2, max_value=100, value=20
281
  )
282
  submitted = st.form_submit_button("submit")
283
- if submitted:
284
- st.write("original video:")
285
- st.video(url)
286
- output_path = strike(
287
- url, speedup_factor, min_speedup, max_speedup, max_num_segments
288
- )
289
- st.write("processed audio:")
290
- st.audio(output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  def strike(url, speedup_factor, min_speedup, max_speedup, max_num_segments):
166
 
167
+ assert min_speedup >= 0.5 # ffmpeg limit
168
 
169
  with st.spinner("downloading..."):
170
  name = download(url, YDL_OPTS)
 
207
  with st.spinner("stitching segments..."):
208
  cat_clips(squashed_times, speedups, audio_path, output_path)
209
 
 
 
 
 
 
 
 
210
  times = np.array([(seg["start"] + seg["end"]) / 2 for seg in segments])
211
  times /= 60
212
  annotations = [seg["text"] for seg in segments]
 
233
  )
234
  )
235
  st.altair_chart((lines + dots).interactive(), use_container_width=True)
236
+ st.info("hover over the dots in the plot above this message to read the transcript")
237
+
238
+ st.write("sped-up audio:")
239
+ st.audio(output_path)
240
 
241
  times = sum([list(x) for x in squashed_times], [])
242
  times = np.array(times)
 
248
  max_actual_speedups = max(speedups)
249
  eps = 0.1
250
  lines = (
251
+ alt.Chart(df, title="adaptive speedup based on information rate")
252
  .mark_line()
253
  .encode(
254
  x=alt.X(cols[0], scale=alt.Scale(domain=(min_time, max_time))),
 
262
  )
263
  st.altair_chart(lines.interactive(), use_container_width=True)
264
 
 
265
 
266
+ st.markdown(
267
+ """
268
+ ## cobra
269
+ cobra stands for (co)nstant (b)it-(r)ate (a)udio.
270
+ it's a tool for speeding up audio from podcasts and lectures.
271
+ instead of applying the same speedup (like 1.5x) to the entire file,
272
+ it applies a higher speedup to parts of the file with less information content,
272
+ and a lower speedup to parts with higher information content.
274
+ it measures information content using a language model.
275
+
276
+ ## usage
277
+ 1. enter a youtube url
278
+ 2. specify your desired overall speedup
279
+ 3. specify your minimum speedup. no segment of the file will be sped up by less than this.
280
+ 4. specify your maximum speedup. no segment of the file will be sped up by more than this.
281
+ 5. specify how much variance you'd like to see in the speedup over time (2 = constant speedup throughout the file, 100 = frequently-changing speedup)
282
+ 6. hit submit
283
+ 7. wait for the charts and processed audio to appear. it can take a while to download, transcribe, calculate information density, and stitch segments.
284
+ """
285
+ )
286
 
287
  with st.form("my_form"):
288
  url = st.text_input(
289
  "youtube url", value="https://www.youtube.com/watch?v=_3MBQm7GFIM"
290
  )
291
  speedup_factor = st.slider("speedup", min_value=1.0, max_value=10.0, value=1.5)
292
+ min_speedup = st.slider("minimum speedup", min_value=0.5, max_value=5.0, value=1.0)
293
+ max_speedup = st.slider("maximum speedup", min_value=0.5, max_value=5.0, value=2.0)
 
294
  max_num_segments = st.slider(
295
  "variance in speedup over time", min_value=2, max_value=100, value=20
296
  )
297
  submitted = st.form_submit_button("submit")
298
+ if min_speedup <= speedup_factor and speedup_factor <= max_speedup:
299
+ if submitted:
300
+ st.write("original video:")
301
+ st.video(url)
302
+ strike(url, speedup_factor, min_speedup, max_speedup, max_num_segments)
303
+ else:
304
+ st.write("speedup must be between min and max")
305
+
306
+ st.markdown(
307
+ """
308
+ ## example
309
+ """
310
+ )
311
+ st.image(
312
+ "example.png",
313
+ width=400,
314
+ caption="the information rate is lower in the first half of the video, when they are bs'ing and using buzzwords, so the speedup is higher. the information rate is higher in the second half of the video, when they walk through a concrete example of codex solving a challenging programming problem, so the speedup is lower.",
315
+ )
316
+
317
+ st.markdown(
318
+ """
319
+ ## algorithm
320
+ 1. download the audio of a youtube video (e.g., a podcast or lecture) using [youtube-dl](https://youtube-dl.org/)
321
+ 2. use [whisper](https://github.com/openai/whisper) to transcribe the audio into text
322
+ 3. use the [flan-t5](https://huggingface.co/docs/transformers/model_doc/flan-t5) language model to compute the negative log-likelihood of each text token given the previous tokens
323
+ 4. compute the information rate of each text segment in the transcript: negative log-likelihood of all tokens in segment divided by duration of segment
324
+ 5. fit a piecewise-constant function to the information rate vs. time data using a decision tree regression model from [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html). this lets us control the number of segments that will be stitched together in step 8, which can run slowly if the number of segments is too large.
325
+ 6. compute speedup for each segment: 1 / information rate (induces constant bit-rate over time)
326
+ 7. clip speedups with user's min and max, and use binary search to find linear scaling factor that matches the user's desired overall speedup
327
+ 8. apply scaled and clipped speedups to each segment, and stitch the segments together using ffmpeg
328
+ """
329
+ )
example.png ADDED

Git LFS Details

  • SHA256: a8e2dc0ec6abbcf8ae7af071749ff0c94939e786c627fbaf84edfcd16d8eb2ce
  • Pointer size: 132 Bytes
  • Size of remote file: 1.36 MB