Spaces:

Audio-AGI
/

WavJourney

Running on A10G

App Files Community

zzk1st committed on Aug 22, 2023

Commit

5bd33c2

1 Parent(s): 1e9b2c6

Multiple user API keys

Browse files

Files changed (6) hide show

config.yaml +1 -1
pipeline.py +34 -32
share_btn.py +2 -2
ui_client.py +12 -8
utils.py +1 -2
wavjourney_cli.py +4 -1

config.yaml CHANGED Viewed

@@ -18,4 +18,4 @@ Voice-Parser:
 Service-Port: 5000
-OpenAI-Key: ''


18
19	Service-Port: 5000
20
21	+ OpenAI-Key: ''

pipeline.py CHANGED Viewed

@@ -4,7 +4,6 @@ from string import Template
 import openai
 import re
 import glob
-from utils import get_key
 import pickle
 import time
 import json5
@@ -26,28 +25,33 @@ if USE_OPENAI_CACHE:
         with open(cache_file, 'rb') as file:
             openai_cache.append(pickle.load(file))
-openai.api_key = get_key()
-def chat_with_gpt(prompt):
     if USE_OPENAI_CACHE:
         filtered_object = list(filter(lambda x: x['prompt'] == prompt, openai_cache))
         if len(filtered_object) > 0:
             response = filtered_object[0]['response']
             return response
-    chat = openai.ChatCompletion.create(
-        # model="gpt-3.5-turbo",
-        model="gpt-4",
-        messages=[
-            {
-                "role": "system",
-                "content": "You are a helpful assistant."
-            },
-            {
-                "role": "user",
-                "content": prompt
-            }
-        ]
-    )
     if USE_OPENAI_CACHE:
         cache_obj = {
             'prompt': prompt,
@@ -120,10 +124,10 @@ def init_session(session_id=''):
     return session_id
 @retry(stop_max_attempt_number=3)
-def input_text_to_json_script_with_retry(complete_prompt_path):
     print("    trying ...")
     complete_prompt = get_file_content(complete_prompt_path)
-    json_response = try_extract_content_from_quotes(chat_with_gpt(complete_prompt))
     json_data = json5.loads(json_response)
     try:
@@ -138,22 +142,20 @@ def input_text_to_json_script_with_retry(complete_prompt_path):
     return json_response
 # Step 1: input_text to json
-def input_text_to_json_script(input_text, output_path):
     print('Step 1: Writing audio script with LLM ...')
     input_text = maybe_get_content_from_file(input_text)
     text_to_audio_script_prompt = get_file_content('prompts/text_to_json.prompt')
     prompt = f'{text_to_audio_script_prompt}\n\nInput text: {input_text}\n\nScript:\n'
     complete_prompt_path = output_path / 'complete_input_text_to_audio_script.prompt'
     write_to_file(complete_prompt_path, prompt)
-    audio_script_response = input_text_to_json_script_with_retry(complete_prompt_path)
     generated_audio_script_filename = output_path / 'audio_script.json'
     write_to_file(generated_audio_script_filename, audio_script_response)
     return audio_script_response
 # Step 2: json to char-voice map
-def json_script_to_char_voice_map(json_script, voices, output_path):
-    def create_complete_char_voice_map(char_voice_map):
-        return
     print('Step 2: Parsing character voice with LLM...')
     json_script_content = maybe_get_content_from_file(json_script)
     prompt = get_file_content('prompts/audio_script_to_character_voice_map.prompt')
@@ -161,7 +163,7 @@ def json_script_to_char_voice_map(json_script, voices, output_path):
     prompt = Template(prompt).substitute(voice_and_desc=presets_str)
     prompt = f"{prompt}\n\nAudio script:\n'''\n{json_script_content}\n'''\n\noutput:\n"
     write_to_file(output_path / 'complete_audio_script_to_char_voice_map.prompt', prompt)
-    char_voice_map_response = try_extract_content_from_quotes(chat_with_gpt(prompt))
     char_voice_map = json5.loads(char_voice_map_response)
     # enrich char_voice_map with voice preset metadata
     complete_char_voice_map = {c: voices[char_voice_map[c]] for c in char_voice_map}
@@ -188,19 +190,19 @@ def audio_code_gen_to_result(audio_gen_code_path):
     os.system(f'python {audio_gen_code_filename}')
 # Function call used by Gradio: input_text to json
-def generate_json_file(session_id, input_text):
     output_path = utils.get_session_path(session_id)
     # Step 1
-    return input_text_to_json_script(input_text, output_path)
 # Function call used by Gradio: json to result wav
-def generate_audio(session_id, json_script):
     output_path = utils.get_session_path(session_id)
     output_audio_path = utils.get_session_audio_path(session_id)
     voices = voice_presets.get_merged_voice_presets(session_id)
     # Step 2
-    char_voice_map = json_script_to_char_voice_map(json_script, voices, output_path)
     # Step 3
     json_script_filename = output_path / 'audio_script.json'
     char_voice_map_filename = output_path / 'character_voice_map.json'
@@ -214,6 +216,6 @@ def generate_audio(session_id, json_script):
     return result_wav_filename, char_voice_map
 # Convenient function call used by wavjourney_cli
-def full_steps(session_id, input_text):
-    json_script = generate_json_file(session_id, input_text)
-    return generate_audio(session_id, json_script)

 import openai
 import re
 import glob
 import pickle
 import time
 import json5
         with open(cache_file, 'rb') as file:
             openai_cache.append(pickle.load(file))
+def chat_with_gpt(prompt, api_key):
     if USE_OPENAI_CACHE:
         filtered_object = list(filter(lambda x: x['prompt'] == prompt, openai_cache))
         if len(filtered_object) > 0:
             response = filtered_object[0]['response']
             return response
+    try:
+        openai.api_key = api_key
+        chat = openai.ChatCompletion.create(
+            # model="gpt-3.5-turbo",
+            model="gpt-4",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ]
+        )
+    finally:
+        openai.api_key = ''
     if USE_OPENAI_CACHE:
         cache_obj = {
             'prompt': prompt,
     return session_id
 @retry(stop_max_attempt_number=3)
+def input_text_to_json_script_with_retry(complete_prompt_path, api_key):
     print("    trying ...")
     complete_prompt = get_file_content(complete_prompt_path)
+    json_response = try_extract_content_from_quotes(chat_with_gpt(complete_prompt, api_key))
     json_data = json5.loads(json_response)
     try:
     return json_response
 # Step 1: input_text to json
+def input_text_to_json_script(input_text, output_path, api_key):
     print('Step 1: Writing audio script with LLM ...')
     input_text = maybe_get_content_from_file(input_text)
     text_to_audio_script_prompt = get_file_content('prompts/text_to_json.prompt')
     prompt = f'{text_to_audio_script_prompt}\n\nInput text: {input_text}\n\nScript:\n'
     complete_prompt_path = output_path / 'complete_input_text_to_audio_script.prompt'
     write_to_file(complete_prompt_path, prompt)
+    audio_script_response = input_text_to_json_script_with_retry(complete_prompt_path, api_key)
     generated_audio_script_filename = output_path / 'audio_script.json'
     write_to_file(generated_audio_script_filename, audio_script_response)
     return audio_script_response
 # Step 2: json to char-voice map
+def json_script_to_char_voice_map(json_script, voices, output_path, api_key):
     print('Step 2: Parsing character voice with LLM...')
     json_script_content = maybe_get_content_from_file(json_script)
     prompt = get_file_content('prompts/audio_script_to_character_voice_map.prompt')
     prompt = Template(prompt).substitute(voice_and_desc=presets_str)
     prompt = f"{prompt}\n\nAudio script:\n'''\n{json_script_content}\n'''\n\noutput:\n"
     write_to_file(output_path / 'complete_audio_script_to_char_voice_map.prompt', prompt)
+    char_voice_map_response = try_extract_content_from_quotes(chat_with_gpt(prompt, api_key))
     char_voice_map = json5.loads(char_voice_map_response)
     # enrich char_voice_map with voice preset metadata
     complete_char_voice_map = {c: voices[char_voice_map[c]] for c in char_voice_map}
     os.system(f'python {audio_gen_code_filename}')
 # Function call used by Gradio: input_text to json
+def generate_json_file(session_id, input_text, api_key):
     output_path = utils.get_session_path(session_id)
     # Step 1
+    return input_text_to_json_script(input_text, output_path, api_key)
 # Function call used by Gradio: json to result wav
+def generate_audio(session_id, json_script, api_key):
     output_path = utils.get_session_path(session_id)
     output_audio_path = utils.get_session_audio_path(session_id)
     voices = voice_presets.get_merged_voice_presets(session_id)
     # Step 2
+    char_voice_map = json_script_to_char_voice_map(json_script, voices, output_path, api_key)
     # Step 3
     json_script_filename = output_path / 'audio_script.json'
     char_voice_map_filename = output_path / 'character_voice_map.json'
     return result_wav_filename, char_voice_map
 # Convenient function call used by wavjourney_cli
+def full_steps(session_id, input_text, api_key):
+    json_script = generate_json_file(session_id, input_text, api_key)
+    return generate_audio(session_id, json_script, api_key)

share_btn.py CHANGED Viewed

@@ -26,7 +26,7 @@ share_js = """async () => {
         const res = await fetch(videoEl.src);
         const blob = await res.blob();
         const videoId = Date.now() % 200;
-        const fileName = `sd-perception-${{videoId}}.mp4`;
         return new File([blob], fileName, { type: 'video/mp4' });
 	}
@@ -40,7 +40,7 @@ share_js = """async () => {
         });
       }
     const gradioEl = document.querySelector("gradio-app").shadowRoot || document.querySelector('body > gradio-app');
-    const inputPromptEl = gradioEl.querySelector('#prompt-in input').value;
     const outputVideoEl = gradioEl.querySelector('#output-video video');
     let titleTxt = `WavJourney: ${inputPromptEl}`;

         const res = await fetch(videoEl.src);
         const blob = await res.blob();
         const videoId = Date.now() % 200;
+        const fileName = `sd-perception-${videoId}.mp4`;
         return new File([blob], fileName, { type: 'video/mp4' });
 	}
         });
       }
     const gradioEl = document.querySelector("gradio-app").shadowRoot || document.querySelector('body > gradio-app');
+    const inputPromptEl = gradioEl.querySelector('#prompt-in textarea').value;
     const outputVideoEl = gradioEl.querySelector('#output-video video');
     let titleTxt = `WavJourney: ${inputPromptEl}`;

ui_client.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import shutil
 import json5
-import openai
 import gradio as gr
 from tabulate import tabulate
@@ -44,11 +44,13 @@ def convert_char_voice_map_to_md(char_voice_map):
 def generate_script_fn(instruction, _state: gr.State):
     try:
         session_id = _state['session_id']
-        json_script = generate_json_file(session_id, instruction)
         table_text = convert_json_to_md(json_script)
     except Exception as e:
         gr.Warning(str(e))
         print(f"Generating script error: {str(e)}")
         return [
             None,
             _state,
@@ -89,9 +91,8 @@ def generate_audio_fn(state):
         ]
     except Exception as e:
         print(f"Generation audio error: {str(e)}")
         gr.Warning(str(e))
-        # For debugging, uncomment the line below
-        #raise e
     return [
         None,
@@ -172,8 +173,8 @@ def get_system_voice_presets():
     return data
-def set_openai_key(key):
-    openai.api_key = key
     return key
@@ -191,7 +192,10 @@ def add_voice_preset(vp_id, vp_desc, file, ui_state, added_voice_preset):
             add_session_voice_preset(vp_id, vp_desc, file_path, session_id)
             added_voice_preset['count'] = count + 1
         except Exception as exception:
             gr.Warning(str(exception))
     # After added
     dataframe = get_voice_preset_to_list(ui_state)
     df_visible = gr.Dataframe.update(visible=True)
@@ -379,7 +383,7 @@ with gr.Blocks(css=css) as interface:
     system_voice_presets = get_system_voice_presets()
     # State
-    ui_state = gr.State(value={'session_id': pipeline.init_session()})
     selected_voice_presets = gr.State(value={'selected_voice_preset': None})
     added_voice_preset_state = gr.State(value={'added_file': None, 'count': 0})
     # UI Component
@@ -461,7 +465,7 @@ with gr.Blocks(css=css) as interface:
     )
     # events
-    key_text_input.change(fn=set_openai_key, inputs=[key_text_input], outputs=[key_text_input])
     text_input.change(fn=textbox_listener, inputs=[text_input], outputs=[generate_script_btn])
     generate_audio_btn.click(
         fn=generate_audio_fn,

 import shutil
 import json5
+import traceback
 import gradio as gr
 from tabulate import tabulate
 def generate_script_fn(instruction, _state: gr.State):
     try:
         session_id = _state['session_id']
+        api_key = _state['api_key']
+        json_script = generate_json_file(session_id, instruction, api_key)
         table_text = convert_json_to_md(json_script)
     except Exception as e:
         gr.Warning(str(e))
         print(f"Generating script error: {str(e)}")
+        traceback.print_exc()
         return [
             None,
             _state,
         ]
     except Exception as e:
         print(f"Generation audio error: {str(e)}")
+        traceback.print_exc()
         gr.Warning(str(e))
     return [
         None,
     return data
+def set_openai_key(key, _state):
+    _state['api_key'] = key
     return key
             add_session_voice_preset(vp_id, vp_desc, file_path, session_id)
             added_voice_preset['count'] = count + 1
         except Exception as exception:
+            print(exception)
+            traceback.print_exc()
             gr.Warning(str(exception))
     # After added
     dataframe = get_voice_preset_to_list(ui_state)
     df_visible = gr.Dataframe.update(visible=True)
     system_voice_presets = get_system_voice_presets()
     # State
+    ui_state = gr.State(value={'session_id': pipeline.init_session(), 'api_key': ''})
     selected_voice_presets = gr.State(value={'selected_voice_preset': None})
     added_voice_preset_state = gr.State(value={'added_file': None, 'count': 0})
     # UI Component
     )
     # events
+    key_text_input.change(fn=set_openai_key, inputs=[key_text_input, ui_state], outputs=[key_text_input])
     text_input.change(fn=textbox_listener, inputs=[text_input], outputs=[generate_script_btn])
     generate_audio_btn.click(
         fn=generate_audio_fn,

utils.py CHANGED Viewed

@@ -62,6 +62,5 @@ def fade(audio_data, fade_duration=2, sr=32000):
 def get_key(config='config.yaml'):
     with open('config.yaml', 'r') as file:
         config = yaml.safe_load(file)
-        openai_key = config['OpenAI-Key']
-    return openai_key

 def get_key(config='config.yaml'):
     with open('config.yaml', 'r') as file:
         config = yaml.safe_load(file)
+        return config['OpenAI-Key'] if 'OpenAI-Key' in config else None

wavjourney_cli.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import time
 import argparse
 import pipeline
 parser = argparse.ArgumentParser()
 parser.add_argument('-f', '--full', action='store_true', help='Go through the full process')
 parser.add_argument('--input-text', type=str, default='', help='input text or text file')
 parser.add_argument('--session-id', type=str, default='', help='session id, if set to empty, system will allocate an id')
 args = parser.parse_args()
 if args.full:
@@ -14,10 +16,11 @@ if args.full:
     start_time = time.time()
     session_id = pipeline.init_session(args.session_id)
     print(f"Session {session_id} is created.")
-    pipeline.full_steps(session_id, input_text)
     end_time = time.time()
     print(f"WavJourney took {end_time - start_time:.2f} seconds to complete.")

 import time
 import argparse
+import utils
 import pipeline
 parser = argparse.ArgumentParser()
 parser.add_argument('-f', '--full', action='store_true', help='Go through the full process')
 parser.add_argument('--input-text', type=str, default='', help='input text or text file')
 parser.add_argument('--session-id', type=str, default='', help='session id, if set to empty, system will allocate an id')
+parser.add_argument('--api-key', type=str, default='', help='api key used for GPT-4')
 args = parser.parse_args()
 if args.full:
     start_time = time.time()
     session_id = pipeline.init_session(args.session_id)
+    api_key = args.api_key if args.api_key != '' else utils.get_key()
     print(f"Session {session_id} is created.")
+    pipeline.full_steps(session_id, input_text, api_key)
     end_time = time.time()
     print(f"WavJourney took {end_time - start_time:.2f} seconds to complete.")