# Hugging Face Space app (Spaces: Running on Zero)
import spaces
import torch
import gradio as gr
from transformers import pipeline, AutoModel, LlamaTokenizer, LlamaForCausalLM, InstructBlipForConditionalGeneration, InstructBlipProcessor
from PIL import Image  # needed for Image.open / Image.LANCZOS below
import numpy as np
#import yaml
#import os
import requests
import nltk
import scipy.io.wavfile
import os
import subprocess
from huggingface_hub import hf_hub_download
subprocess.run(['bash','llama.sh'])
from llama_cpp import Llama
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.putenv("HF_HUB_ENABLE_HF_TRANSFER","1")
from espnet2.bin.tts_inference import Text2Speech
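# References for an alternative llama.cpp (GGML) Vicuna backend; the download and Llama setup below are currently disabled.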
repo_id = "Sosaka/Vicuna-7B-4bit-ggml"
filename = "vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin"
cache_dir="~/.cache/huggingface/hub"
#hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)
'''
llm = Llama(
model_path="~/.cache/huggingface/hub/vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin",
n_gpu_layers=-1, # Uncomment to use GPU acceleration
# seed=1337, # Uncomment to set a specific seed
n_ctx=4096, # Uncomment to increase the context window
)
llm = Llama.from_pretrained(
repo_id="Sosaka/Vicuna-7B-4bit-ggml",
filename="vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin",
n_gpu_layers=-1, # Uncomment to use GPU acceleration
n_ctx = 4096,
verbose=False
)
'''
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    nltk.download('averaged_perceptron_tagger_eng')
try:
    nltk.data.find('corpora/cmudict')  # Check for cmudict
except LookupError:
    nltk.download('cmudict')
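# Whisper ASR pipeline used to transcribe microphone or uploaded audio.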
ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device='cuda' if torch.cuda.is_available() else 'cpu',  # Use GPU if available
)
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]
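# Load the Vicuna-13B chat model and tokenizer onto the GPU in float16.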
def _preload_and_load_models():
    global vicuna_tokenizer, vicuna_model
    #VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
    VICUNA_MODEL_NAME = "lmsys/vicuna-13b-v1.5"  # Or another model
    #VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"  # Or another model
    vicuna_tokenizer = LlamaTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = LlamaForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        #torch_dtype=torch.float16,
        #device_map="auto",  # or .to('cuda')
    ).to('cuda', torch.float16)  # Explicitly move to CUDA after loading
_preload_and_load_models()
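# ESPnet VITS text-to-speech model and InstructBLIP image captioner (used when an image prompt is supplied).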
tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
model5 = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b").to('cuda',torch.bfloat16)
processor5 = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
cap_prompt = (
    "Describe this image with a caption to be used for question answering."
)
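# Main handler: transcribe speech, optionally caption the image, generate a Vicuna reply, and synthesize it as audio.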
@spaces.GPU(required=True)
def process_audio(img, microphone, audio_upload, state, answer_mode):  # Added audio_upload
    audio_source = None
    if microphone:
        audio_source = microphone
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe(audio_source)["text"]
    elif audio_upload:
        audio_source = audio_upload
        rate, data = scipy.io.wavfile.read(audio_source)
        if data.dtype != np.float32:
            data = data.astype(np.float32) / np.iinfo(data.dtype).max  # rough normalization of integer PCM to [-1, 1]
        if data.ndim > 1:
            data = data.mean(axis=1)  # downmix to mono
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe({"sampling_rate": rate, "raw": data})["text"]  # pass the sample rate so the pipeline can resample
    else:
        return state, state, None  # No audio input
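    # Build the tutoring prompt for Vicuna; an image caption is prepended below when an image is supplied.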
system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
You answer questions clearly and simply, using age-appropriate language.
You are also a little bit silly and like to make jokes."""
prompt = f"{system_prompt}\nUser: {text}"
    if img is not None:
        img_path = img if isinstance(img, str) else img.name  # gr.File may pass a path string or a tempfile-like object
        sd_image_a = Image.open(img_path).convert('RGB')
        sd_image_a = sd_image_a.resize((512, 512), Image.LANCZOS)  # resize returns a new image; assign it before captioning
        inputsa = processor5(images=sd_image_a, text=cap_prompt, return_tensors="pt").to('cuda', torch.bfloat16)  # match the captioner's bfloat16 weights
        with torch.no_grad():
            generated_ids = model5.generate(
                **inputsa,
                do_sample=True,
                num_beams=1,
                max_length=96,
                min_length=64,
                top_p=0.9,
                repetition_penalty=1.0,
                length_penalty=2.0,
                temperature=0.5,
            )
        generated_text = processor5.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        print(generated_text)
        prompt = f"{system_prompt}\nImage: {generated_text}\nUser: {text}"
    with torch.no_grad():
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
        if answer_mode == 'slow':
            torch.backends.cuda.matmul.allow_tf32 = False
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
            torch.backends.cudnn.allow_tf32 = False
            torch.backends.cudnn.deterministic = False
            torch.backends.cudnn.benchmark = True
            torch.set_float32_matmul_precision("highest")
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_new_tokens=512,
                min_new_tokens=256,
                do_sample=True,
                low_memory=False
            )
        '''
        vicuna_output = llm(
            **vicuna_input,
            max_tokens=96,  # Generate up to 96 tokens, set to None to generate up to the end of the context window
            stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
            echo=True  # Echo the prompt back in the output
        )
        '''
        if answer_mode == 'medium':
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cudnn.deterministic = False
            torch.backends.cudnn.benchmark = False
            torch.set_float32_matmul_precision("high")
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_length=192,
                min_new_tokens=64,
                do_sample=True,
                low_memory=False
            )
        if answer_mode == 'fast':
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            #torch.backends.cuda.preferred_blas_library = "cublas"
            #torch.backends.cuda.preferred_linalg_library = "cusolver"
            torch.set_float32_matmul_precision("medium")
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                #max_new_tokens=64,
                min_new_tokens=16,
                do_sample=True,
                low_memory=True
            )
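    # Decode the generation, strip the echoed prompt, and append this turn to the running conversation.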
    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
    vicuna_response = vicuna_response.replace(prompt, "").strip()
    updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
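    # Synthesize the reply with the ESPnet VITS model; on failure, return the text without audio.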
    try:
        with torch.no_grad():
            output = tts(vicuna_response)
            wav = output["wav"]
            sr = tts.fs
            audio_arr = wav.cpu().numpy()
            SAMPLE_RATE = sr
            audio_arr = audio_arr / np.abs(audio_arr).max()
            audio_output = (SAMPLE_RATE, audio_arr)
            #sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)  # Removed writing to file
    except requests.exceptions.RequestException as e:
        print(f"Error in Hugging Face API request: {e}")
        audio_output = None
    except Exception as e:
        print(f"Error in speech synthesis: {e}")
        audio_output = None
    return updated_state, updated_state, audio_output
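# Gradio UI: image prompt plus microphone or uploaded audio in; transcription, tutor reply, and synthesized speech out.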
with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo:  # Updated title
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
    with gr.Tab("Transcribe & Synthesize"):
        with gr.Row():  # Added a row for better layout
            image = gr.File(label="Image Prompt (Optional)")
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here", elem_id="mic_audio")
            audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File")  # Added upload component
        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
        answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
        transcription_state = gr.State(value="")
        mic_input.change(
            fn=process_audio,
            inputs=[image, mic_input, audio_upload, transcription_state, answer_mode],  # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output]
        )
        audio_upload.change(  # Added change event for upload
            fn=process_audio,
            inputs=[image, mic_input, audio_upload, transcription_state, answer_mode],  # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output],
            api_name='/api/predict'
        )
if __name__ == '__main__':
    demo.launch(share=False)
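# Example client call (a sketch, not part of the app): assumes the Space is reachable at SPACE_URL,
# that the `gradio_client` package is installed, and that "question.wav" is a placeholder filename.
# The positional arguments mirror the `inputs` list wired above; api_name matches the upload handler.
#
# from gradio_client import Client, handle_file
# client = Client(SPACE_URL)
# text_out, state_out, audio_out = client.predict(
#     None,                          # image prompt (optional)
#     None,                          # microphone recording (unused here)
#     handle_file("question.wav"),   # uploaded audio file
#     "",                            # conversation state
#     "medium",                      # answer_mode
#     api_name="/api/predict",
# )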