update readme
Browse files
README.md
CHANGED
@@ -1101,6 +1101,7 @@ else:
|
|
1101 |
|
1102 |
### Audio-Only mode
|
1103 |
#### Mimick
|
|
|
1104 |
```python
|
1105 |
mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
|
1106 |
audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
|
@@ -1124,16 +1125,18 @@ res = model.chat(
|
|
1124 |
```python
|
1125 |
ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True) # load the reference audio
|
1126 |
|
1127 |
-
#
|
1128 |
-
|
1129 |
-
|
|
|
1130 |
|
1131 |
-
|
1132 |
-
|
1133 |
-
|
1134 |
```
|
1135 |
```python
|
1136 |
msgs = [sys_prompt, user_question]
|
|
|
1137 |
res = model.chat(
|
1138 |
msgs=msgs,
|
1139 |
tokenizer=tokenizer,
|
@@ -1179,7 +1182,7 @@ General Audio:
|
|
1179 |
Audio Caption: Summarize the main content of the audio.
|
1180 |
Sound Scene Tagging: Utilize one keyword to convey the audio's content or the associated scene.
|
1181 |
'''
|
1182 |
-
task_prompt = "
|
1183 |
audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
|
1184 |
|
1185 |
msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
|
@@ -1204,19 +1207,19 @@ Speech Generation Task Prompt:
|
|
1204 |
# 在新闻中,一个年轻男性兴致勃勃地说:“祝福亲爱的祖国母亲美丽富强!”他用低音调和低音量,慢慢地说出了这句话。
|
1205 |
# Delighting in a surprised tone, an adult male with low pitch and low volume comments:"One even gave my little dog a biscuit" This dialogue takes place at a leisurely pace, delivering a sense of excitement and surprise in the context.
|
1206 |
|
1207 |
-
Voice Cloning or Voice
|
1208 |
'''
|
1209 |
# Human Instruction-to-Speech:
|
1210 |
-
task_prompt = '' #Try to make some Human Instruction-to-Speech prompt
|
1211 |
-
msgs = [{'role': 'user', 'content': [task_prompt]}] # you can try to
|
1212 |
|
1213 |
-
# Voice Cloning mode:
|
1214 |
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
|
1215 |
# text_prompt = f"Please read the text below."
|
1216 |
# user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
|
1217 |
# user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Creation)
|
|
|
1218 |
|
1219 |
-
msgs = [sys_prompt, user_question]
|
1220 |
res = model.chat(
|
1221 |
msgs=msgs,
|
1222 |
tokenizer=tokenizer,
|
|
|
1101 |
|
1102 |
### Audio-Only mode
|
1103 |
#### Mimick
|
1104 |
+
- In this task, you can see the model's end-to-end capability. MiniCPM-o 2.6 takes an audio input and produces both an automatic speech recognition (ASR) transcription and a voice imitation (TTS) output.
|
1105 |
```python
|
1106 |
mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
|
1107 |
audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
|
|
|
1125 |
```python
|
1126 |
ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True) # load the reference audio
|
1127 |
|
1128 |
+
# Choose the mode you want to use
|
1129 |
+
# Audio RolePlay: # With this mode, the model will role-play the character based on the audio prompt. (More human-like conversation but unstable)
|
1130 |
+
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
|
1131 |
+
# user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
|
1132 |
|
1133 |
+
Audio Assistant: # With this mode, the model will speak with the voice in ref_audio as an AI assistant. (Stable and more suitable for general conversation)
|
1134 |
+
sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
|
1135 |
+
user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # Try asking something by recording it in 'xxx.wav'!
|
1136 |
```
|
1137 |
```python
|
1138 |
msgs = [sys_prompt, user_question]
|
1139 |
+
# round one
|
1140 |
res = model.chat(
|
1141 |
msgs=msgs,
|
1142 |
tokenizer=tokenizer,
|
|
|
1182 |
Audio Caption: Summarize the main content of the audio.
|
1183 |
Sound Scene Tagging: Utilize one keyword to convey the audio's content or the associated scene.
|
1184 |
'''
|
1185 |
+
task_prompt = "" # Choose the task prompt above
|
1186 |
audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
|
1187 |
|
1188 |
msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
|
|
|
1207 |
# 在新闻中,一个年轻男性兴致勃勃地说:“祝福亲爱的祖国母亲美丽富强!”他用低音调和低音量,慢慢地说出了这句话。
|
1208 |
# Delighting in a surprised tone, an adult male with low pitch and low volume comments:"One even gave my little dog a biscuit" This dialogue takes place at a leisurely pace, delivering a sense of excitement and surprise in the context.
|
1209 |
|
1210 |
+
Voice Cloning or Voice Conversion: With this mode, the model will act like a TTS model.
|
1211 |
'''
|
1212 |
# Human Instruction-to-Speech:
|
1213 |
+
task_prompt = '' # Try writing a Human Instruction-to-Speech prompt (Voice Creation)
|
1214 |
+
msgs = [{'role': 'user', 'content': [task_prompt]}] # you can also try to ask the same audio question
|
1215 |
|
1216 |
+
# Voice Cloning mode:
|
1217 |
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
|
1218 |
# text_prompt = f"Please read the text below."
|
1219 |
# user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
|
1220 |
# user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Creation)
|
1221 |
+
# msgs = [sys_prompt, user_question]
|
1222 |
|
|
|
1223 |
res = model.chat(
|
1224 |
msgs=msgs,
|
1225 |
tokenizer=tokenizer,
|