Cuiunbo commited on
Commit
640e05c
·
1 Parent(s): 0c9ab77

update readme

Browse files
Files changed (1) hide show
  1. README.md +15 -12
README.md CHANGED
@@ -1101,6 +1101,7 @@ else:
1101
 
1102
  ### Audio-Only mode
1103
  #### Mimick
 
1104
  ```python
1105
  mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
1106
  audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
@@ -1124,16 +1125,18 @@ res = model.chat(
1124
  ```python
1125
  ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True) # load the reference audio
1126
 
1127
- # Audio RolePlay: # With this mode, model will role-play the character based on the audio prompt.
1128
- sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
1129
- user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
 
1130
 
1131
- # Audio Assistant: # With this mode, model will speak with the voice in ref_audio as a AI assistant.
1132
- # sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
1133
- # user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # Try to ask something!
1134
  ```
1135
  ```python
1136
  msgs = [sys_prompt, user_question]
 
1137
  res = model.chat(
1138
  msgs=msgs,
1139
  tokenizer=tokenizer,
@@ -1179,7 +1182,7 @@ General Audio:
1179
  Audio Caption: Summarize the main content of the audio.
1180
  Sound Scene Tagging: Utilize one keyword to convey the audio's content or the associated scene.
1181
  '''
1182
- task_prompt = "\n"
1183
  audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
1184
 
1185
  msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
@@ -1204,19 +1207,19 @@ Speech Generation Task Prompt:
1204
  # 在新闻中,一个年轻男性兴致勃勃地说:“祝福亲爱的祖国母亲美丽富强!”他用低音调和低音量,慢慢地说出了这句话。
1205
  # Delighting in a surprised tone, an adult male with low pitch and low volume comments:"One even gave my little dog a biscuit" This dialogue takes place at a leisurely pace, delivering a sense of excitement and surprise in the context.
1206
 
1207
- Voice Cloning or Voice Creation: With this mode, model will act like a TTS model.
1208
  '''
1209
  # Human Instruction-to-Speech:
1210
- task_prompt = '' #Try to make some Human Instruction-to-Speech prompt
1211
- msgs = [{'role': 'user', 'content': [task_prompt]}] # you can try to use the same audio question
1212
 
1213
- # Voice Cloning mode: With this mode, model will act like a TTS model.
1214
  # sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
1215
  # text_prompt = f"Please read the text below."
1216
  # user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
1217
  # user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Creation)
 
1218
 
1219
- msgs = [sys_prompt, user_question]
1220
  res = model.chat(
1221
  msgs=msgs,
1222
  tokenizer=tokenizer,
 
1101
 
1102
  ### Audio-Only mode
1103
  #### Mimick
1104
+ - In this task, you can see the model's end-to-end capability. MiniCPM-o 2.6 takes an audio input and produces both an automatic speech recognition (ASR) transcription and a voice imitation (TTS) output.
1105
  ```python
1106
  mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
1107
  audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
 
1125
  ```python
1126
  ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True) # load the reference audio
1127
 
1128
+ # Choose the mode you want to use
1129
+ # Audio RolePlay: # With this mode, model will role-play the character based on the audio prompt. (More human-like conversation but unstable)
1130
+ # sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
1131
+ # user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
1132
 
1133
+ # Audio Assistant: With this mode, model will speak with the voice in ref_audio as an AI assistant. (Stable and more suitable for general conversation)
1134
+ sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
1135
+ user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # Try to ask something by recording it in 'xxx.wav'!!!
1136
  ```
1137
  ```python
1138
  msgs = [sys_prompt, user_question]
1139
+ # round one
1140
  res = model.chat(
1141
  msgs=msgs,
1142
  tokenizer=tokenizer,
 
1182
  Audio Caption: Summarize the main content of the audio.
1183
  Sound Scene Tagging: Utilize one keyword to convey the audio's content or the associated scene.
1184
  '''
1185
+ task_prompt = "" # Choose the task prompt above
1186
  audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
1187
 
1188
  msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
 
1207
  # 在新闻中,一个年轻男性兴致勃勃地说:“祝福亲爱的祖国母亲美丽富强!”他用低音调和低音量,慢慢地说出了这句话。
1208
  # Delighting in a surprised tone, an adult male with low pitch and low volume comments:"One even gave my little dog a biscuit" This dialogue takes place at a leisurely pace, delivering a sense of excitement and surprise in the context.
1209
 
1210
+ Voice Cloning or Voice Conversion: With this mode, model will act like a TTS model.
1211
  '''
1212
  # Human Instruction-to-Speech:
1213
+ task_prompt = '' # Try to make some Human Instruction-to-Speech prompt (Voice Creation)
1214
+ msgs = [{'role': 'user', 'content': [task_prompt]}] # you can also try to ask the same audio question
1215
 
1216
+ # Voice Cloning mode:
1217
  # sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
1218
  # text_prompt = f"Please read the text below."
1219
  # user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
1220
  # user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Creation)
1221
+ # msgs = [sys_prompt, user_question]
1222
 
 
1223
  res = model.chat(
1224
  msgs=msgs,
1225
  tokenizer=tokenizer,