import spaces

import torch
import gradio as gr
from transformers import pipeline, LlamaTokenizer, LlamaForCausalLM, InstructBlipForConditionalGeneration, InstructBlipProcessor
import numpy as np
import requests
import nltk
import scipy.io.wavfile
import os
import subprocess
from PIL import Image  # needed for Image.open()/resize() in process_audio
from huggingface_hub import hf_hub_download

subprocess.run(['bash','llama.sh'])
from llama_cpp import Llama
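# llama.sh is assumed to install/build llama-cpp-python; the Llama class is only
# exercised by the GGML snippet kept commented out below, not by the main pipeline.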

os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.putenv("HF_HUB_ENABLE_HF_TRANSFER","1")

from espnet2.bin.tts_inference import Text2Speech

repo_id = "Sosaka/Vicuna-7B-4bit-ggml"
filename = "vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin"
cache_dir="~/.cache/huggingface/hub"
#hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)
'''
llm = Llama(
      model_path="~/.cache/huggingface/hub/vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin",
      n_gpu_layers=-1, # Uncomment to use GPU acceleration
      # seed=1337, # Uncomment to set a specific seed
      n_ctx=4096, # Uncomment to increase the context window
)

llm = Llama.from_pretrained(
    repo_id="Sosaka/Vicuna-7B-4bit-ggml",
    filename="vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin",
    n_gpu_layers=-1, # Uncomment to use GPU acceleration
    n_ctx = 4096,
    verbose=False
)
'''
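# ESPnet's English G2P frontend relies on these NLTK resources, so fetch them up front if missing.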
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    nltk.download('averaged_perceptron_tagger_eng')
try:
    nltk.data.find('corpora/cmudict')  # Check for cmudict
except LookupError:
    nltk.download('cmudict')

ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device='cuda' if torch.cuda.is_available() else 'cpu', # Use GPU if available
)

# Whisper's task tokens sit among the tokenizer's special IDs; these offsets
# pick out <|transcribe|> and <|translate|> for this checkpoint.
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]

def _preload_and_load_models():
    global vicuna_tokenizer, vicuna_model
    #VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
    VICUNA_MODEL_NAME = "lmsys/vicuna-13b-v1.5"  # Or another model
    #VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"  # Or another model
    vicuna_tokenizer = LlamaTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = LlamaForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        #torch_dtype=torch.float16,
        #device_map="auto",
    ).to('cuda', torch.float16)  # explicitly move to CUDA in half precision after loading

_preload_and_load_models()

tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
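# InstructBLIP (loaded below) captions the optional image prompt so the caption can be folded into the Vicuna prompt.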

model5 = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b").to('cuda',torch.bfloat16)

processor5 = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

cap_prompt = (
    "Describe this image with a caption to be used for question answering."
)
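
# @spaces.GPU asks the Space for a GPU for the duration of each call (required on ZeroGPU hardware).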

@spaces.GPU(required=True)
def process_audio(img, microphone, audio_upload, state, answer_mode):  # Added audio_upload
    audio_source = None
    if microphone:
        audio_source = microphone
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
        text = asr_pipe(audio_source)["text"]
    elif audio_upload:
        audio_source = audio_upload
        rate, data = scipy.io.wavfile.read(audio_source)
        # scipy returns integer PCM; the ASR pipeline expects float samples plus the sampling rate
        if np.issubdtype(data.dtype, np.integer):
            data = data.astype(np.float32) / np.iinfo(data.dtype).max
        if data.ndim > 1:
            data = data.mean(axis=1)  # downmix stereo to mono
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe({"raw": data, "sampling_rate": rate})["text"]
    else:
        return state, state, None  # No audio input
    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
        You answer questions clearly and simply, using age-appropriate language.
        You are also a little bit silly and like to make jokes."""
    prompt = f"{system_prompt}\nUser: {text}"
    
    if img is not None:
        sd_image_a = Image.open(img.name).convert('RGB')
        sd_image_a = sd_image_a.resize((512, 512), Image.LANCZOS)  # resize before captioning; the original call discarded the result
        inputsa = processor5(images=sd_image_a, text=cap_prompt, return_tensors="pt").to('cuda', torch.bfloat16)  # cast pixel values to match the bfloat16 captioner
        with torch.no_grad():
            generated_ids = model5.generate(
                **inputsa,
                do_sample=True,
                num_beams=1,
                max_length=96,
                min_length=64,
                top_p=0.9,
                repetition_penalty=1.0,
                length_penalty=2.0,
                temperature=0.5,
            )
        generated_text = processor5.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        print(generated_text)
        prompt =  f"{system_prompt}\nImage: {generated_text}\nUser: {text}"
    with torch.no_grad():
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
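        # The three answer modes trade latency for output length and numeric precision by
        # toggling TF32 / reduced-precision matmul flags and token budgets before generate().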
        if answer_mode == 'slow':
            torch.backends.cuda.matmul.allow_tf32 = False
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
            torch.backends.cudnn.allow_tf32 = False
            torch.backends.cudnn.deterministic = False
            torch.backends.cudnn.benchmark = True
            torch.set_float32_matmul_precision("highest")
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_new_tokens = 512,
                min_new_tokens = 256,
                do_sample = True,
                low_memory = False
            )
            '''
            vicuna_output = llm(
                **vicuna_input,
                max_tokens=96, # Generate up to 32 tokens, set to None to generate up to the end of the context window
                stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
                echo=True # Echo the prompt back in the output
            )
            '''
        if answer_mode == 'medium':
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cudnn.deterministic = False
            torch.backends.cudnn.benchmark = False
            torch.set_float32_matmul_precision("high")
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_length = 192,
                min_new_tokens = 64,
                do_sample = True,
                low_memory = False
            )
        if answer_mode == 'fast':
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            # torch.backends.cuda.preferred_blas_library="cublas"
            # torch.backends.cuda.preferred_linalg_library="cusolver"
            torch.set_float32_matmul_precision("medium")
            with torch.no_grad():
                vicuna_output = vicuna_model.generate(
                    **vicuna_input,
                    #max_new_tokens = 64,
                    min_new_tokens = 16,
                    do_sample = True,
                    low_memory = True
                )
        vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
    vicuna_response = vicuna_response.replace(prompt, "").strip()
    updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
    try:
        with torch.no_grad():
            output = tts(vicuna_response)
        wav = output["wav"]
        sample_rate = tts.fs
        audio_arr = wav.cpu().numpy()
        peak = np.abs(audio_arr).max()
        if peak > 0:
            audio_arr = audio_arr / peak  # normalize to [-1, 1] for Gradio's numpy audio output
        audio_output = (sample_rate, audio_arr)
    except Exception as e:
        print(f"Error in speech synthesis: {e}")
        audio_output = None
    return updated_state, updated_state, audio_output

with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo:  # Updated title
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
    with gr.Tab("Transcribe & Synthesize"):
        with gr.Row(): # Added a row for better layout
            image = gr.File(label="Image Prompt (Optional)")
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here", elem_id="mic_audio")
            audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File") # Added upload component
        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
        answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium', label="Answer Mode")
        transcription_state = gr.State(value="")
        mic_input.change(
            fn=process_audio,
            inputs=[image, mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output]
        )
        audio_upload.change( # Added change event for upload
            fn=process_audio,
            inputs=[image, mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output],
            api_name='/api/predict'
        )
        
if __name__ == '__main__':
    demo.launch(share=False)