FinVoc2Vec
We introduce FinVoc2Vec, a vocal tone classifier designed for real-world corporate disclosures. In the first stage, we apply a self-supervised pre-training procedure that allows the base model to adapt to the acoustic characteristics of disclosure environments using a sample of 500,000 unlabeled sentences of conference call speech. In the second stage, we apply a supervised fine-tuning procedure that enables the model to learn representations of human-labeled vocal tone. We construct a speech corpus containing 5,000 audio recordings of linguistically neutral sentences from conference calls and manually label each sentence with perceived vocal tone: positive, negative, or neutral.
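Conceptually, the released model pairs a wav2vec2 encoder (the base model is facebook/wav2vec2-large-xlsr-53) with a three-way classification head. As a rough sketch of what the second, supervised stage looks like (the mean pooling and linear head below are illustrative assumptions, not the documented architecture):

import torch
import torch.nn as nn
from transformers import Wav2Vec2Model

class ToneClassifierSketch(nn.Module):
    """Illustrative encoder-plus-head setup; not the released implementation."""

    def __init__(self, base: str = "facebook/wav2vec2-large-xlsr-53", num_labels: int = 3):
        super().__init__()
        self.encoder = Wav2Vec2Model.from_pretrained(base)  # stage 1: (adapted) base model
        self.head = nn.Linear(self.encoder.config.hidden_size, num_labels)  # stage 2: tone head

    def forward(self, input_values, attention_mask=None):
        hidden = self.encoder(input_values, attention_mask=attention_mask).last_hidden_state
        pooled = hidden.mean(dim=1)  # average over time frames
        return self.head(pooled)     # logits for negative / neutral / positive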
Example Usage
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, AutoModel
import torchaudio
@dataclass
class DataCollatorWithPadding:
    processor: Union[Wav2Vec2Processor, Wav2Vec2FeatureExtractor]
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self,
                 features: List[Dict[str, Union[List[int], torch.Tensor]]]
                 ) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        # pad to a common length within the batch and build the attention mask
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt")
        return batch
def preprocess_audio(batch: Dict,
                     feature_extractor: Wav2Vec2FeatureExtractor = None,
                     max_duration: Optional[float] = 20.0):
    target_sr = feature_extractor.sampling_rate  # 16 kHz
    audio_arrays = []
    for path in batch['path']:
        audio_array, sampling_rate = torchaudio.load(path)
        # downmix to mono if multiple channels exist
        if audio_array.shape[0] > 1:
            audio_array = torch.mean(audio_array, dim=0, keepdim=True)
        # resample to the feature extractor's sampling rate
        resampler = torchaudio.transforms.Resample(sampling_rate, target_sr)
        audio_array = resampler(audio_array).squeeze().numpy()
        audio_arrays.append(audio_array)
    # maximum number of samples per clip for the feature extractor
    max_length = int(target_sr * max_duration) if max_duration is not None else None
    # normalize the inputs and truncate them to max_length
    result = feature_extractor(
        audio_arrays,
        sampling_rate=target_sr,
        max_length=max_length,
        truncation=bool(max_length))
    return result
# load model
model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)

# load feature extractor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("waiv/FinVoc2Vec")

# load dataset
# NOTE: the dataset needs a 'path' feature holding the path to each audio file;
# select the split you want to score
test_dataset = load_dataset(r'path/to/dataset', split='test')
# preprocess audio data
test_dataset = test_dataset.map(
    preprocess_audio,
    batch_size=1000,
    batched=True,
    num_proc=4,
    fn_kwargs={'feature_extractor': feature_extractor,
               'max_duration': 20.0})
data_collator = DataCollatorWithPadding(feature_extractor)
data_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=4)
model.eval()  # disable dropout for deterministic inference
with torch.no_grad():
    for batch in data_loader:
        attention_mask, inputs = batch['attention_mask'], batch['input_values']
        model_output = model(inputs, attention_mask=attention_mask)
        logits = model_output['logits'].to(torch.float32)
        probs = F.softmax(logits, dim=1).numpy()
        label_to_id = model.config.label2id
        dict_probs = {'prob_negative': probs[:, label_to_id['negative']],
                      'prob_neutral': probs[:, label_to_id['neutral']],
                      'prob_positive': probs[:, label_to_id['positive']]}
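To reduce these probabilities to discrete tone labels, take the argmax per row and map it through the config's id2label table; a minimal sketch that continues the loop above (id2label is the standard transformers counterpart of the label2id mapping already used):

# most likely class index per sentence, mapped to its label string
pred_ids = probs.argmax(axis=1)
pred_labels = [model.config.id2label[int(i)] for i in pred_ids]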
Register for autoclass
To register the model with your local Auto classes, use the following code:
from transformers import AutoConfig, AutoModel

# download model and config
finvoc2vec_config = AutoConfig.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)
finvoc2vec_model = AutoModel.from_pretrained("waiv/FinVoc2Vec", trust_remote_code=True)

# register the dynamically loaded config and model classes for the Auto classes
AutoConfig.register("finvoc2vec", type(finvoc2vec_config))
AutoModel.register(type(finvoc2vec_config), type(finvoc2vec_model))
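Once the mapping is registered, the Auto classes can resolve the finvoc2vec model type through your local class mapping. As a quick check, a model can be instantiated directly from the config (note that from_config builds a freshly initialized model, not the trained weights):

# the registered mapping lets AutoModel resolve the class from the config type
model_from_config = AutoModel.from_config(finvoc2vec_config)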
Further resources
Check the 🤗 Hugging Face Wav2Vec2 model description for additional resources and configurations.
License
- This model is a derivative work based on Wav2Vec2 (Apache 2.0).
- The model is licensed under the Creative Commons Attribution-NonCommercial 4.0 International license (CC BY-NC 4.0).
Paper
BibTeX
@article{ewertz2024,
  title={Listen Closely: Measuring Vocal Tone in Corporate Disclosures},
  author={Ewertz, Jonas and Knickrehm, Charlotte and Nienhaus, Martin and Reichmann, Doron},
  year={2024},
  note={Available at SSRN: \url{https://ssrn.com/abstract=4307178}}
}
Model tree for waiv/FinVoc2Vec
- Base model: facebook/wav2vec2-large-xlsr-53