#Importing the required libraries
import gradio as gr
import librosa
from transformers import AutoFeatureExtractor, pipeline
def load_and_fix_data(input_file, model_sampling_rate):
    """Load an audio file, downmix it to mono, and resample to the model's rate.

    Parameters
    ----------
    input_file : str
        Path to the audio file to load.
    model_sampling_rate : int
        Sampling rate (Hz) expected by the ASR model.

    Returns
    -------
    numpy.ndarray
        1-D float waveform at ``model_sampling_rate``.
    """
    speech, sample_rate = librosa.load(input_file)
    # librosa.load() defaults to mono, but guard against multi-channel input.
    # Average the channels rather than summing them: a plain sum doubles the
    # amplitude and can clip the signal.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)
    if sample_rate != model_sampling_rate:
        # librosa >= 0.10 removed the positional sample-rate arguments; the
        # keyword form below also works on older 0.8/0.9 releases.
        speech = librosa.resample(
            speech, orig_sr=sample_rate, target_sr=model_sampling_rate
        )
    return speech
# --- Model setup (runs once at import time; downloads weights on first run) ---
# Spanish ASR model: wav2vec2 XLS-R, fine-tuned for Spanish speech recognition.
model_asr = "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
# The feature extractor exposes the sampling rate the ASR model expects;
# load_and_fix_data() resamples incoming audio to match it.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_asr)
sampling_rate = feature_extractor.sampling_rate
asr = pipeline("automatic-speech-recognition", model=model_asr)
# Spanish sentiment classifier (BETO). Per the demo description below, its
# labels are NEG / NEU / POS.
model_sentiment_classifier = "finiteautomata/beto-sentiment-analysis"
classifier = pipeline("sentiment-analysis", model = model_sentiment_classifier)
def speech_to_text(speech):
    """Transcribe a speech waveform with the global ASR pipeline.

    Long inputs are processed in 12-second chunks with a 1-second stride so
    audio of arbitrary length can be handled.
    """
    prediction = asr(speech, chunk_length_s=12, stride_length_s=1)
    return prediction["text"]
def sentiment_classifier(text):
    """Return the sentiment label predicted for *text* (NEG / NEU / POS)."""
    predictions = classifier(text)
    top_prediction = predictions[0]
    return top_prediction["label"]
# Separator placed between the transcription and the sentiment in the output.
new_line = "\n\n\n"

def asr_and_sentiment_detection(input_file):
    """End-to-end pipeline: load audio, transcribe it, classify its sentiment.

    Returns a single formatted string carrying both the transcription and the
    detected sentiment label.
    """
    waveform = load_and_fix_data(input_file, sampling_rate)
    text = speech_to_text(waveform)
    label = sentiment_classifier(text)
    return f"Audio Transcription :{text} {new_line} Detected Sentiment: {label}"
# --- Gradio UI wiring (executes at import time) -----------------------------
# NOTE(review): gr.inputs / gr.outputs and the `source=`, `layout=`, `theme=`
# and `enable_queue=` arguments are the legacy Gradio 2.x/3.x API; confirm the
# pinned gradio version before upgrading (4.x uses gr.Audio(sources=[...]) and
# queue() on the interface).
inputs = [gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio")]
outputs = [gr.outputs.Textbox(label="Predicción")]
# Example clips are expected to sit next to this script in the app repo.
examples = [["audio_test.wav"], ["sample_audio.wav"], ["test1.wav"], ["test2.wav"], ["Example1.wav"]]
title = "Spanish ASR and Sentiment Classifier"
description = """ This is a Gradio demo for Spanish ASR and Sentiment Analysis. First, we do Speech to Text conversion, and then we perform sentiment analysis on the obtained transcription of the input audio.
**Note regarding predicted labels : NEG --> NEGATIVE, NEU --> NEUTRAL, POS --> POSITIVE**
Pre-trained model used for Spanish ASR: [jonatasgrosman/wav2vec2-xls-r-1b-spanish](https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-spanish)
Pre-trained model used for Sentiment Analysis of transcribed audio: [finiteautomata/beto-sentiment-analysis](https://huggingface.co/finiteautomata/beto-sentiment-analysis)
"""
# Build and launch the demo; enable_queue serializes requests so long-running
# ASR calls do not time out.
gr.Interface(
    asr_and_sentiment_detection,
    inputs = inputs,
    outputs=outputs,
    examples=examples,
    title=title,
    description=description,
    layout="horizontal",
    theme="huggingface",
).launch(enable_queue=True)