import spaces
import torch
import gradio as gr
from transformers import pipeline, AutoModel, LlamaTokenizer, LlamaForCausalLM, InstructBlipForConditionalGeneration, InstructBlipProcessor
import numpy as np
#import yaml
#import os
import requests
import nltk
import scipy.io.wavfile
import os
import subprocess
from PIL import Image  # Needed for Image.open / Image.LANCZOS in process_audio
from huggingface_hub import hf_hub_download

# llama.sh must run before llama_cpp can be imported.
subprocess.run(['bash', 'llama.sh'])
from llama_cpp import Llama

os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # os.putenv would not update os.environ for this process

from espnet2.bin.tts_inference import Text2Speech

repo_id = "Sosaka/Vicuna-7B-4bit-ggml"
filename = "vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin"
cache_dir = "~/.cache/huggingface/hub"
#hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)
'''
llm = Llama(
    model_path="~/.cache/huggingface/hub/vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin",
    n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,  # Uncomment to set a specific seed
    n_ctx=4096,  # Uncomment to increase the context window
)
llm = Llama.from_pretrained(
    repo_id="Sosaka/Vicuna-7B-4bit-ggml",
    filename="vicuna-7B-1.1-ggml_q4_0-ggjt_v3.bin",
    n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    n_ctx=4096,
    verbose=False,
)
'''

# Download NLTK data required by the TTS text frontend if it is not already cached.
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    nltk.download('averaged_perceptron_tagger_eng')
try:
    nltk.data.find('corpora/cmudict')  # Check for cmudict
except LookupError:
    nltk.download('cmudict')

ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device='cuda' if torch.cuda.is_available() else 'cpu',  # Use GPU if available
)
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]

def _preload_and_load_models():
    global vicuna_tokenizer, vicuna_model
    #VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
    VICUNA_MODEL_NAME = "lmsys/vicuna-13b-v1.5"  # Or another model
    #VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"  # Or another model
    vicuna_tokenizer = LlamaTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = LlamaForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        #torch_dtype=torch.float16,
        #device_map="auto",
    ).to('cuda', torch.float16)  # Explicitly move to CUDA after loading

_preload_and_load_models()

tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits", device='cuda')

model5 = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b").to('cuda', torch.bfloat16)
processor5 = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

cap_prompt = (
    "Describe this image with a caption to be used for question answering."
)

@spaces.GPU(required=True)
def process_audio(img, microphone, audio_upload, state, answer_mode):  # Added audio_upload
    audio_source = None
    if microphone:
        audio_source = microphone
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe(audio_source)["text"]
    elif audio_upload:
        audio_source = audio_upload
        rate, data = scipy.io.wavfile.read(audio_source)
        if np.issubdtype(data.dtype, np.integer):
            data = data.astype(np.float32) / np.iinfo(data.dtype).max  # Normalize integer PCM to [-1, 1]
        if data.ndim > 1:
            data = data.mean(axis=1)  # Downmix to mono for Whisper
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe({"raw": data, "sampling_rate": rate})["text"]
    else:
        return state, state, None  # No audio input

    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9). You answer questions clearly and simply, using age-appropriate language.
You are also a little bit silly and like to make jokes."""
    prompt = f"{system_prompt}\nUser: {text}"
    if img is not None:
        # Caption the image with InstructBLIP so the text model gets visual context.
        sd_image_a = Image.open(img.name).convert('RGB')
        sd_image_a = sd_image_a.resize((512, 512), Image.LANCZOS)  # Resize before preprocessing (resize returns a new image)
        inputsa = processor5(images=sd_image_a, text=cap_prompt, return_tensors="pt").to('cuda')
        with torch.no_grad():
            generated_ids = model5.generate(
                **inputsa,
                do_sample=True,
                num_beams=1,
                max_length=96,
                min_length=64,
                top_p=0.9,
                repetition_penalty=1.0,
                length_penalty=2.0,
                temperature=0.5,
            )
        generated_text = processor5.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        print(generated_text)
        prompt = f"{system_prompt}\nImage: {generated_text}\nUser: {text}"
    with torch.no_grad():
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
        if answer_mode == 'slow':
            torch.backends.cuda.matmul.allow_tf32 = False
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
            torch.backends.cudnn.allow_tf32 = False
            torch.backends.cudnn.deterministic = False
            torch.backends.cudnn.benchmark = True
            torch.set_float32_matmul_precision("highest")
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_new_tokens=512,
                min_new_tokens=256,
                do_sample=True,
                low_memory=False
            )
            '''
            vicuna_output = llm(
                **vicuna_input,
                max_tokens=96,  # Generate up to 96 tokens, set to None to generate up to the end of the context window
                stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
                echo=True  # Echo the prompt back in the output
            )
            '''
        if answer_mode == 'medium':
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cudnn.deterministic = False
            torch.backends.cudnn.benchmark = False
            torch.set_float32_matmul_precision("high")
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_length=192,
                min_new_tokens=64,
                do_sample=True,
                low_memory=False
            )
        if answer_mode == 'fast':
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            #torch.backends.cuda.preferred_blas_library = "cublas"
            #torch.backends.cuda.preferred_linalg_library = "cusolver"
            torch.set_float32_matmul_precision("medium")
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                #max_new_tokens=64,
                min_new_tokens=16,
                do_sample=True,
                low_memory=True
            )
    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
    vicuna_response = vicuna_response.replace(prompt, "").strip()
    updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
    try:
        with torch.no_grad():
            output = tts(vicuna_response)
        wav = output["wav"]
        sr = tts.fs
        audio_arr = wav.cpu().numpy()
        SAMPLE_RATE = sr
        audio_arr = audio_arr / np.abs(audio_arr).max()  # Normalize to [-1, 1] for Gradio playback
        audio_output = (SAMPLE_RATE, audio_arr)
        #sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)  # Removed writing to file
    except requests.exceptions.RequestException as e:
        print(f"Error in Hugging Face API request: {e}")
        audio_output = None
    except Exception as e:
        print(f"Error in speech synthesis: {e}")
        audio_output = None
    return updated_state, updated_state, audio_output

with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo:  # Updated title
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
    with gr.Tab("Transcribe & Synthesize"):
        with gr.Row():  # Added a row for better layout
            image = gr.File(label="Image Prompt (Optional)")
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here", elem_id="mic_audio")
            audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File")  # Added upload component
        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
        answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
        transcription_state = gr.State(value="")
        mic_input.change(
            fn=process_audio,
            inputs=[image, mic_input, audio_upload, transcription_state, answer_mode],  # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output]
        )
        audio_upload.change(  # Added change event for upload
            fn=process_audio,
            inputs=[image, mic_input, audio_upload, transcription_state, answer_mode],  # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output],
            api_name='/api/predict'
        )

if __name__ == '__main__':
    demo.launch(share=False)