File size: 2,957 Bytes
8982daf
114afff
5aae4ef
8ba8aed
1d3d4a8
8982daf
1d3d4a8
71e5c37
1d3d4a8
5e53195
1d3d4a8
 
 
 
8982daf
 
 
1d3d4a8
8982daf
 
 
 
 
 
 
 
 
 
1d3d4a8
8982daf
 
 
 
1d3d4a8
8982daf
1d3d4a8
8982daf
 
1d3d4a8
8982daf
 
 
619b35a
3a603a5
8982daf
 
 
 
6e845d7
8982daf
6e845d7
8982daf
6e845d7
3a603a5
 
 
 
 
1d3d4a8
 
8982daf
 
 
 
 
3a603a5
1d3d4a8
 
8982daf
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#Importing the required libraries
import gradio as gr   
import librosa 
from transformers import AutoFeatureExtractor, pipeline 

#Loading and fixing the audio file
def load_and_fix_data(input_file, model_sampling_rate):
    """Load an audio file, downmix to mono if needed, and resample it.

    Parameters
    ----------
    input_file : str
        Path to the audio file to load.
    model_sampling_rate : int
        Target sampling rate expected by the ASR model.

    Returns
    -------
    numpy.ndarray
        1-D mono waveform at ``model_sampling_rate``.
    """
    speech, sample_rate = librosa.load(input_file)
    # librosa.load returns mono by default, but guard against a 2-D
    # (samples, channels) array anyway.
    if len(speech.shape) > 1:
        # Average the channels rather than summing them: a plain sum doubles
        # the amplitude and can clip the waveform.
        speech = (speech[:, 0] + speech[:, 1]) / 2
    if sample_rate != model_sampling_rate:
        # orig_sr/target_sr are keyword-only in librosa >= 0.10; passing them
        # positionally raises TypeError there.
        speech = librosa.resample(
            speech, orig_sr=sample_rate, target_sr=model_sampling_rate
        )
    return speech

#Loading the feature extractor and setting up the pipeline
# Spanish wav2vec2 checkpoint used for automatic speech recognition.
model_asr = "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
# The feature extractor exposes the sampling rate the model was trained with,
# so incoming audio can be resampled to match before inference.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_asr)
sampling_rate = feature_extractor.sampling_rate
asr = pipeline("automatic-speech-recognition", model=model_asr)

#Instantiating the pipeline for sentiment analysis
# BETO (Spanish BERT) checkpoint fine-tuned for sentiment; it predicts the
# NEG / NEU / POS labels mentioned in the demo description below.
model_sentiment_classifier = "finiteautomata/beto-sentiment-analysis"
classifier = pipeline("sentiment-analysis", model = model_sentiment_classifier)

#Defining a function for speech-to_text conversion
def speech_to_text(speech):
    """Transcribe a mono waveform to Spanish text with the ASR pipeline.

    Long recordings are processed in 12-second chunks with a 1-second stride.
    """
    return asr(speech, chunk_length_s=12, stride_length_s=1)["text"]

#Defining a function to classify sentiment of the resulting audio transcription
def sentiment_classifier(text):
    """Return the sentiment label (NEG/NEU/POS) predicted for ``text``."""
    top_prediction = classifier(text)[0]
    return top_prediction["label"]

new_line = "\n\n\n"

#Defining a function that outputs audio transcription and the result of sentiment detection module
def asr_and_sentiment_detection(input_file):
    """Run the full demo pipeline on one audio file.

    Loads and resamples the recording, transcribes it to Spanish text,
    classifies the transcription's sentiment, and returns both results
    combined into a single display string.
    """
    waveform = load_and_fix_data(input_file, sampling_rate)
    transcription = speech_to_text(waveform)
    sentiment = sentiment_classifier(transcription)
    return (
        f"Audio Transcription :{transcription} "
        f"{new_line} Detected Sentiment: {sentiment}"
    )


# --- Gradio UI wiring ---
# NOTE(review): gr.inputs / gr.outputs, the `layout` and `theme` kwargs, and
# launch(enable_queue=...) are the legacy Gradio 2.x/3.x API and were removed
# in Gradio 4.x — confirm the pinned gradio version before upgrading.
inputs = [gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio")]
outputs = [gr.outputs.Textbox(label="Predicción")]
# Sample audio files expected alongside the app for one-click demo runs.
examples = [["audio_test.wav"], ["sample_audio.wav"], ["test1.wav"], ["test2.wav"], ["Example1.wav"]]
title = "Spanish ASR and Sentiment Classifier"

description = """ This is a Gradio demo for Spanish ASR and Sentiment Analysis. First, we do Speech to Text conversion, and then we perform sentiment analysis on the obtained transcription of the input audio. 

**Note regarding predicted labels : NEG --> NEGATIVE, NEU --> NEUTRAL, POS --> POSITIVE**

Pre-trained model used for Spanish ASR: [jonatasgrosman/wav2vec2-xls-r-1b-spanish](https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-spanish)

Pre-trained model used for Sentiment Analysis of transcribed audio: [finiteautomata/beto-sentiment-analysis](https://huggingface.co/finiteautomata/beto-sentiment-analysis)
"""


# Build the demo and launch it with request queueing enabled (required for
# long-running ASR inference on Hugging Face Spaces).
gr.Interface(
    asr_and_sentiment_detection,
    inputs = inputs,
    outputs=outputs,
    examples=examples,
    title=title,
    description=description,
    layout="horizontal",
    theme="huggingface",
).launch(enable_queue=True)