# Spaces: Runtime error
| import os | |
| from pydub import AudioSegment | |
| import openai | |
| from openai import OpenAI | |
| import feedparser | |
| from pathlib import Path | |
| import wikipedia | |
| import json | |
# Module-level OpenAI client; load_whisper_api() sends its Whisper requests
# through it. OpenAI() is called without arguments, so credentials have to come
# from the client's own defaults -- presumably the OPENAI_API_KEY environment
# variable; TODO confirm for the deployment.
openai_audio = OpenAI()
def load_whisper_api(audio):
    """Transcribe an audio file to text using the OpenAI Whisper API.

    Args:
        audio: Path of the audio file to upload.

    Returns:
        The transcription result for ``response_format="text"``, i.e. the
        transcript itself rather than a JSON-style object.
    """
    # The context manager closes the handle even when the API call raises;
    # previously the file was opened and never closed.
    with open(audio, "rb") as audio_file:
        return openai_audio.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text",
        )
def get_transcribe_podcast(rss_url, local_path, episode_name="podcast_episode.mp3"):
    """Download a podcast episode and transcribe it with the Whisper API.

    The response body of ``rss_url`` is saved into ``local_path`` and treated
    as MP3 audio. A file larger than the upload threshold is split into
    20-minute chunks that are transcribed one after another.

    NOTE(review): ``st`` and ``requests`` are used here but are not imported in
    the visible part of this file -- confirm they are in scope at call time.
    NOTE(review): despite its name, ``rss_url`` is downloaded directly and is
    never parsed as a feed, so it presumably has to point at the audio file
    itself -- verify against callers.

    Args:
        rss_url: URL whose response body is stored as the episode audio.
        local_path: Directory for the downloaded episode; created if missing.
        episode_name: File name used for the download. This name used to be
            referenced without ever being defined, which raised ``NameError``.

    Returns:
        The transcript text. Chunked transcripts are joined with a space.

    Raises:
        Whatever ``raise_for_status()`` raises when the download fails.
    """
    st.info("Starting Podcast Transcription Function...")
    print("Feed URL: ", rss_url)
    print("Local Path:", local_path)

    # Download the podcast episode
    download_dir = Path(local_path)
    download_dir.mkdir(parents=True, exist_ok=True)
    episode_path = download_dir / episode_name

    st.info("Downloading the podcast episode...")
    with requests.get(rss_url, stream=True) as r:
        r.raise_for_status()
        with open(episode_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    st.info("Podcast Episode downloaded")

    # Perform the transcription
    st.info("Starting podcast transcription")

    # Compare raw bytes against the 24 MB threshold named in the message
    # below. The earlier check rounded the size to one decimal and compared
    # against 25, so a file slightly above 25 MB could still be uploaded whole.
    # It also rebuilt the path as ``local_path + episode_name``, which points
    # at the wrong file whenever ``local_path`` has no trailing separator.
    max_upload_bytes = 24 * 1024 * 1024
    if os.path.getsize(episode_path) <= max_upload_bytes:
        # load_whisper_api() requests response_format="text", so the result is
        # already the transcript; indexing it with ['text'] fails.
        results = load_whisper_api(episode_path)
    else:
        st.info('File size larger than 24mb, applying chunking and transcription')
        song = AudioSegment.from_file(str(episode_path), format='mp3')

        # PyDub handles time in milliseconds
        twenty_minutes = 20 * 60 * 1000

        transcriptions = []
        for i, chunk in enumerate(song[::twenty_minutes]):
            # Keep the temporary chunk next to the episode instead of in the
            # current working directory, and remove it once it is transcribed.
            chunk_path = download_dir / f'chunk_{i}.mp3'
            chunk.export(str(chunk_path), format='mp3')
            try:
                transcriptions.append(load_whisper_api(chunk_path).strip())
            finally:
                chunk_path.unlink()

        # A space keeps the text readable; joining with ',' inserted a stray
        # comma at every chunk boundary.
        results = ' '.join(transcriptions)

    # Return the transcribed text
    st.info("Podcast transcription completed, returning results...")
    return results
def get_podcast_summary(podcast_transcript):
    """Summarize a podcast transcript as bullet points.

    Args:
        podcast_transcript: Full transcript text; it is appended to the
            instruction prompt and sent as a single user message.

    Returns:
        The content of the first choice returned by the chat model.
    """
    instructPrompt = """
You are a podcast analyst and your main task is to summarize the key and important points of
the podcast for a busy professional by highlighting the main and important points
to ensure the professional has a sufficient summary of the podcast. Include any questions you consider important or
any points that warrant further investigation.
Please use bulletpoints.
"""
    request = instructPrompt + podcast_transcript

    # ``openai.ChatCompletion`` no longer exists in the openai 1.x package that
    # ``from openai import OpenAI`` requires, so the request goes through the
    # module-level client instead.
    chatOutput = openai_audio.chat.completions.create(
        model="gpt-3.5-turbo-16k",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": request},
        ],
    )
    podcastSummary = chatOutput.choices[0].message.content
    return podcastSummary
def get_podcast_guest(podcast_transcript):
    """Get guest name, professional title, organization name.

    The transcript is sent to the chat model with a forced function call so
    that the guest details come back as structured JSON arguments.

    Args:
        podcast_transcript: Full transcript text, sent as the user message.

    Returns:
        Tuple ``(guest_name, guest_organization, guest_title)``. A field the
        model did not supply, or every field when the arguments cannot be
        parsed, is an empty string.
    """
    # ``openai.ChatCompletion`` no longer exists in the openai 1.x package that
    # ``from openai import OpenAI`` requires, so the request goes through the
    # module-level client instead.
    completion = openai_audio.chat.completions.create(
        model="gpt-3.5-turbo-16k",
        messages=[{"role": "user", "content": podcast_transcript}],
        functions=[
            {
                "name": "get_podcast_guest_information",
                "description": "Get information on the podcast guest using their full name and the name of the organization they are part of to search for them on Wikipedia or Google",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "guest_name": {
                            "type": "string",
                            "description": "The full name of the guest who is being interviewed in the podcast",
                        },
                        "guest_organization": {
                            "type": "string",
                            "description": "The name or details of the organization that the podcast guest belongs to, works for or runs",
                        },
                        "guest_title": {
                            "type": "string",
                            "description": "The title, designation or role the podcast guest holds or type of work that the podcast guest in the organization does",
                        },
                    },
                    "required": ["guest_name"],
                },
            }
        ],
        function_call={"name": "get_podcast_guest_information"},
    )

    no_guest = ("", "", "")

    # 1.x responses are objects, not dicts: read attributes instead of
    # subscripting or calling ``.get`` on the message.
    function_call = completion.choices[0].message.function_call
    if not function_call:
        return no_guest

    # The arguments are model-generated text and are not guaranteed to be
    # valid JSON, so a parse failure yields the empty result.
    try:
        function_args = json.loads(function_call.arguments)
    except json.JSONDecodeError:
        return no_guest
    if not isinstance(function_args, dict):
        return no_guest

    # ``or ""`` keeps the empty-string default for optional fields the model
    # omitted; a bare ``.get`` would return None for them.
    return (
        function_args.get("guest_name") or "",
        function_args.get("guest_organization") or "",
        function_args.get("guest_title") or "",
    )
def get_podcast_highlights(podcast_transcript):
    """Extract key moments from a podcast transcript.

    Args:
        podcast_transcript: Full transcript text; it is appended to the
            instruction prompt and sent as a single user message.

    Returns:
        The content of the first choice returned by the chat model.
    """
    instructPrompt = """
Extract some key moments in the podcast. These are typically interesting insights from the guest or critical questions that the host might have put forward. It could also be a discussion on a hot topic or controversial opinion
"""
    request = instructPrompt + podcast_transcript

    # Send ``request`` (instructions + transcript). The bare transcript used to
    # be sent, so the model never saw the instructions built above.
    # ``openai.ChatCompletion`` no longer exists in the openai 1.x package that
    # ``from openai import OpenAI`` requires, so the request goes through the
    # module-level client instead.
    chatOutput = openai_audio.chat.completions.create(
        model="gpt-3.5-turbo-16k",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": request},
        ],
    )
    podcastHighlights = chatOutput.choices[0].message.content
    return podcastHighlights
def process_podcast(url, path):
    """Get podcast transcription into json.

    Transcribes the episode, then derives a summary, the guest details and the
    highlights from that transcript.

    Args:
        url: Passed to ``get_transcribe_podcast`` as the download URL.
        path: Passed to ``get_transcribe_podcast`` as the download directory.

    Returns:
        Dict with the keys ``podcast_details`` (the transcript),
        ``podcast_summary``, ``podcast_guest``, ``podcast_guest_org``,
        ``podcast_guest_title`` and ``podcast_highlights``.
    """
    # The helpers are plain functions in this file, so they are called
    # directly; ``<function>.call(...)`` raises AttributeError.
    podcast_details = get_transcribe_podcast(url, path)
    podcast_summary = get_podcast_summary(podcast_details)
    guest_name, guest_org, guest_title = get_podcast_guest(podcast_details)
    podcast_highlights = get_podcast_highlights(podcast_details)

    return {
        'podcast_details': podcast_details,
        'podcast_summary': podcast_summary,
        'podcast_guest': guest_name,
        'podcast_guest_org': guest_org,
        'podcast_guest_title': guest_title,
        'podcast_highlights': podcast_highlights,
    }