Luasmontesinos committed on
Commit 8c200c7 · verified · 1 Parent(s): 4fc3b61

Fix code: define device, fix imports, add pipeline

Files changed (1)
  1. app.py +20 -19
app.py CHANGED
@@ -1,54 +1,55 @@
import torch
- from transformers import pipeline
+ import numpy as np
+ from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
+ import gradio as gr
+
+ # Device configuration (GPU if available)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Automatic speech translation pipeline
+ pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0 if torch.cuda.is_available() else -1)

def translate(audio):
-     outputs = pipe(audio, generate_kwargs={"task": "translate","max_new_tokens":256})
+     outputs = pipe(audio, generate_kwargs={"task": "translate", "max_new_tokens": 256})
    return outputs["text"]

- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-
+ # Models for speech synthesis
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

- model.to(device);
- vocoder.to(device);
+ model.to(device)
+ vocoder.to(device)

+ # Speaker embedding
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[6000]["xvector"]).unsqueeze(0)
-

def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
-         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
+         inputs["input_ids"].to(device),
+         speaker_embeddings.to(device),
+         vocoder=vocoder
    )
    return speech.cpu()

- import numpy as np
-
+ # Final conversion
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

-
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
    return 16000, synthesised_speech
-
-
- import gradio as gr

+ # Gradio interface
demo = gr.Interface(
-     # We specify the function used to make the predictions
    fn=speech_to_speech_translation,
-     # We specify the input, in this case audio recorded from the microphone
    inputs=gr.Audio(sources="microphone", type="filepath"),
-     # We specify the output, in this case audio generated by applying the function
-     # given in fn to the input audio
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

- demo.launch(debug=True)
+ demo.launch(debug=True)
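
For a quick sanity check of the fixed script without the microphone UI, speech_to_speech_translation can be called directly on a recorded clip. A minimal sketch, assuming the definitions from app.py above are already loaded in an interactive session (importing app as a module would also trigger demo.launch), and where sample.wav is a hypothetical placeholder for a short speech recording:

# Minimal smoke test; run after executing the definitions from app.py above.
# "sample.wav" is a hypothetical placeholder for any short speech clip.
import scipy.io.wavfile as wavfile

rate, pcm = speech_to_speech_translation("sample.wav")  # the ASR pipeline accepts a file path
wavfile.write("translated.wav", rate, pcm)              # 1-D int16 array at 16 kHz
print(f"wrote translated.wav: {rate} Hz, {pcm.shape[0]} samples")

This mirrors what the Gradio component receives: with type="numpy", gr.Audio expects exactly the (sample_rate, int16 ndarray) tuple that the function returns.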