Cuiunbo commited on
Commit
640e05c
·
1 Parent(s): 0c9ab77

update readme

Browse files
Files changed (1) hide show
  1. README.md +15 -12
README.md CHANGED
@@ -1101,6 +1101,7 @@ else:
1101
 
1102
  ### Audio-Only mode
1103
  #### Mimick
 
1104
  ```python
1105
  mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
1106
  audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
@@ -1124,16 +1125,18 @@ res = model.chat(
1124
  ```python
1125
  ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True) # load the reference audio
1126
 
1127
- # Audio RolePlay: # With this mode, model will role-play the character based on the audio prompt.
1128
- sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
1129
- user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
 
1130
 
1131
- # Audio Assistant: # With this mode, model will speak with the voice in ref_audio as a AI assistant.
1132
- # sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
1133
- # user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # Try to ask something!
1134
  ```
1135
  ```python
1136
  msgs = [sys_prompt, user_question]
 
1137
  res = model.chat(
1138
  msgs=msgs,
1139
  tokenizer=tokenizer,
@@ -1179,7 +1182,7 @@ General Audio:
1179
  Audio Caption: Summarize the main content of the audio.
1180
  Sound Scene Tagging: Utilize one keyword to convey the audio's content or the associated scene.
1181
  '''
1182
- task_prompt = "\n"
1183
  audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
1184
 
1185
  msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
@@ -1204,19 +1207,19 @@ Speech Generation Task Prompt:
1204
  # 在新闻中,一个年轻男性兴致勃勃地说:“祝福亲爱的祖国母亲美丽富强!”他用低音调和低音量,慢慢地说出了这句话。
1205
  # Delighting in a surprised tone, an adult male with low pitch and low volume comments:"One even gave my little dog a biscuit" This dialogue takes place at a leisurely pace, delivering a sense of excitement and surprise in the context.
1206
 
1207
- Voice Cloning or Voice Creation: With this mode, model will act like a TTS model.
1208
  '''
1209
  # Human Instruction-to-Speech:
1210
- task_prompt = '' #Try to make some Human Instruction-to-Speech prompt
1211
- msgs = [{'role': 'user', 'content': [task_prompt]}] # you can try to use the same audio question
1212
 
1213
- # Voice Cloning mode: With this mode, model will act like a TTS model.
1214
  # sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
1215
  # text_prompt = f"Please read the text below."
1216
  # user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
1217
  # user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Creation)
 
1218
 
1219
- msgs = [sys_prompt, user_question]
1220
  res = model.chat(
1221
  msgs=msgs,
1222
  tokenizer=tokenizer,
 
1101
 
1102
  ### Audio-Only mode
1103
  #### Mimick
1104
+ - In this task, you can see the model's end-to-end capability. MiniCPM-o 2.6 takes an audio input and produces both an automatic speech recognition (ASR) transcription and a voice imitation (TTS) output.
1105
  ```python
1106
  mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
1107
  audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
 
1125
  ```python
1126
  ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True) # load the reference audio
1127
 
1128
+ # Choose the mode you want to use
1129
+ # Audio RolePlay: # With this mode, model will role-play the character based on the audio prompt. (More human-like conversation but unstable)
1130
+ # sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
1131
+ # user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
1132
 
1133
+ # Audio Assistant: With this mode, model will speak with the voice in ref_audio as an AI assistant. (Stable and more suitable for general conversation)
1134
+ sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
1135
+ user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # Try to ask something by recording it in 'xxx.wav'!!!
1136
  ```
1137
  ```python
1138
  msgs = [sys_prompt, user_question]
1139
+ # round one
1140
  res = model.chat(
1141
  msgs=msgs,
1142
  tokenizer=tokenizer,
 
1182
  Audio Caption: Summarize the main content of the audio.
1183
  Sound Scene Tagging: Utilize one keyword to convey the audio's content or the associated scene.
1184
  '''
1185
+ task_prompt = "" # Choose the task prompt above
1186
  audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
1187
 
1188
  msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
 
1207
  # 在新闻中,一个年轻男性兴致勃勃地说:“祝福亲爱的祖国母亲美丽富强!”他用低音调和低音量,慢慢地说出了这句话。
1208
  # Delighting in a surprised tone, an adult male with low pitch and low volume comments:"One even gave my little dog a biscuit" This dialogue takes place at a leisurely pace, delivering a sense of excitement and surprise in the context.
1209
 
1210
+ Voice Cloning or Voice Conversion: With this mode, model will act like a TTS model.
1211
  '''
1212
  # Human Instruction-to-Speech:
1213
+ task_prompt = '' # Try to make some Human Instruction-to-Speech prompt (Voice Creation)
1214
+ msgs = [{'role': 'user', 'content': [task_prompt]}] # you can also try to ask the same audio question
1215
 
1216
+ # Voice Cloning mode:
1217
  # sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
1218
  # text_prompt = f"Please read the text below."
1219
  # user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
1220
  # user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Creation)
1221
+ # msgs = [sys_prompt, user_question]
1222
 
 
1223
  res = model.chat(
1224
  msgs=msgs,
1225
  tokenizer=tokenizer,