Update README.md
Browse files
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
pipeline_tag:
|
3 |
datasets:
|
4 |
- openbmb/RLAIF-V-Dataset
|
5 |
library_name: transformers
|
@@ -13,6 +13,10 @@ tags:
|
|
13 |
- multi-image
|
14 |
- video
|
15 |
- custom_code
|
|
|
|
|
|
|
|
|
16 |
---
|
17 |
|
18 |
<h1>A GPT-4o Level MLLM for Vision, Speech and Multimodal Live Streaming on Your Phone</h1>
|
@@ -1217,7 +1221,7 @@ msgs = [{'role': 'user', 'content': [task_prompt]}] # you can also try to ask th
|
|
1217 |
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
|
1218 |
# text_prompt = f"Please read the text below."
|
1219 |
# user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
|
1220 |
-
# user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice
|
1221 |
# msgs = [sys_prompt, user_question]
|
1222 |
|
1223 |
res = model.chat(
|
@@ -1386,4 +1390,4 @@ If you find our work helpful, please consider citing our papers 📝 and liking
|
|
1386 |
journal={arXiv preprint arXiv:2408.01800},
|
1387 |
year={2024}
|
1388 |
}
|
1389 |
-
```
|
|
|
1 |
---
|
2 |
+
pipeline_tag: any-to-any
|
3 |
datasets:
|
4 |
- openbmb/RLAIF-V-Dataset
|
5 |
library_name: transformers
|
|
|
13 |
- multi-image
|
14 |
- video
|
15 |
- custom_code
|
16 |
+
- audio
|
17 |
+
- speech
|
18 |
+
- asr
|
19 |
+
- tts
|
20 |
---
|
21 |
|
22 |
<h1>A GPT-4o Level MLLM for Vision, Speech and Multimodal Live Streaming on Your Phone</h1>
|
|
|
1221 |
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
|
1222 |
# text_prompt = f"Please read the text below."
|
1223 |
# user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
|
1224 |
+
# user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Conversion)
|
1225 |
# msgs = [sys_prompt, user_question]
|
1226 |
|
1227 |
res = model.chat(
|
|
|
1390 |
journal={arXiv preprint arXiv:2408.01800},
|
1391 |
year={2024}
|
1392 |
}
|
1393 |
+
```
|