Spaces:
Sleeping
Sleeping
File size: 3,686 Bytes
41b1069 af2f3ed 4004c75 41b1069 af2f3ed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
from datasets import load_dataset, Audio
from transformers import AutoProcessor
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import evaluate
import streamlit as st
# Hub identifier of the dataset handed to load_dataset below.
ds="PolyAI/minds14"
# Alternative dataset, currently disabled:
# ds = "RaysDipesh/obama-voice-samples-283"
# Load only the first 100 rows of the "train" split of the en-US configuration.
minds = load_dataset(ds, name="en-US", split="train[:100]")
# Carve the 100 rows into "train" and "test" splits, holding out 20% for test.
# No seed is passed, so the split presumably differs between runs -- confirm
# whether reproducibility matters here.
minds = minds.train_test_split(test_size=0.2)
# Bare expression left over from notebook-style inspection.
# NOTE(review): if this file runs as a Streamlit app with "magic" enabled, a
# bare expression like this is presumably rendered into the page -- confirm
# that is intended.
minds
# Drop the columns that are not used further down; only the audio and the
# transcription are read by prepare_dataset.
minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])
# Processor for the wav2vec2-base checkpoint; used by prepare_dataset, the data
# collator, compute_metrics and the model setup.
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
# Re-cast the audio column so samples are decoded at a 16 kHz sampling rate.
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
# Bare expression: accesses the first training example (same Streamlit "magic"
# caveat as the bare `minds` above).
minds["train"][0]
def prepare_dataset(batch):
    """Turn one dataset row into processor outputs plus an ``input_length`` field.

    Args:
        batch: A row exposing ``batch["audio"]`` (with ``"array"`` and
            ``"sampling_rate"`` entries) and ``batch["transcription"]``.

    Returns:
        The object returned by the module-level ``processor`` call, with an
        extra ``"input_length"`` entry holding the length of the first
        ``"input_values"`` sequence.
    """
    waveform = batch["audio"]
    # Audio and text go through the processor in a single call so the returned
    # object carries both the model inputs and the encoded transcription.
    encoded = processor(
        waveform["array"],
        sampling_rate=waveform["sampling_rate"],
        text=batch["transcription"],
    )
    first_sequence = encoded["input_values"][0]
    encoded["input_length"] = len(first_sequence)
    return encoded
def uppercase(example):
    """Return a one-key update with the example's transcription upper-cased.

    Args:
        example: Mapping with a string under ``"transcription"``.

    Returns:
        dict: ``{"transcription": <upper-cased text>}``; other keys of
        ``example`` are not included in the returned mapping.
    """
    # Presumably needed because the tokenizer vocabulary is upper-case -- confirm.
    shouted = example["transcription"].upper()
    return {"transcription": shouted}
@dataclass
class DataCollatorCTCWithPadding:
    """Collate encoded examples into one padded batch with CTC-style labels.

    Audio inputs and label ids are padded in two separate ``processor.pad``
    calls because they have different lengths and need different padding.
    """

    # Processor whose ``pad`` method performs both padding passes.
    processor: AutoProcessor
    # Padding strategy forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and attach a ``"labels"`` tensor to the batch.

        Args:
            features: Examples each providing ``"input_values"`` (only its
                first element is used) and ``"labels"``.

        Returns:
            The padded input batch with ``"labels"`` added; label positions
            whose attention mask is not 1 are set to -100.
        """
        # Inputs and labels are gathered separately: each needs its own padding.
        audio_items = [{"input_values": example["input_values"][0]} for example in features]
        text_items = [{"input_ids": example["labels"]} for example in features]

        padded_inputs = self.processor.pad(
            audio_items,
            padding=self.padding,
            return_tensors="pt",
        )
        padded_labels = self.processor.pad(
            labels=text_items,
            padding=self.padding,
            return_tensors="pt",
        )

        # Padding positions become -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        padded_inputs["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
        return padded_inputs
# Upper-case every transcription in both splits via the uppercase() helper.
minds = minds.map(uppercase)
# Run prepare_dataset over the data with 4 worker processes and drop the
# columns listed for the "train" split, so the original columns are removed
# from the encoded result.
encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)
# Collator that pads each batch to its longest member.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")
# Word-error-rate metric; read as a module-level name inside compute_metrics.
wer = evaluate.load("wer")
import numpy as np
def compute_metrics(pred):
    """Compute the word error rate for one set of evaluation predictions.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max over the
            last axis is taken as the predicted token id) and ``label_ids``
            (reference ids, where -100 marks positions to ignore).

    Returns:
        dict: ``{"wer": <score>}`` as computed by the module-level ``wer``
        metric.

    Note:
        ``pred.label_ids`` is modified in place: every -100 entry is replaced
        by the tokenizer's pad token id so the labels can be decoded.
    """
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # -100 is not a valid token id; swap it for the pad id before decoding.
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # Labels are decoded without grouping repeated tokens.
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    # The score must NOT be bound to the name `wer`: assigning to `wer` inside
    # this function would make it a local variable for the whole body, and the
    # lookup of the module-level metric on the right-hand side would then raise
    # UnboundLocalError.
    wer_score = wer.compute(predictions=pred_str, references=label_str)
    return {"wer": wer_score}
from transformers import AutoModelForCTC, TrainingArguments, Trainer
# NOTE(review): everything below runs at module level. If this file is served
# as a Streamlit app, the script presumably re-executes on every widget
# interaction, which would restart training each time -- confirm and consider
# guarding or caching the training step.

# Load the wav2vec2-base checkpoint with a CTC head, averaging the CTC loss and
# using the tokenizer's pad token id as the model's pad id.
model = AutoModelForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)

# NOTE(review): `evaluation_strategy` here and `tokenizer=` on the Trainer
# below are, as far as I know, deprecated spellings in recent transformers
# releases -- confirm against the pinned version before renaming them.
training_args = TrainingArguments(
    output_dir="my_awesome_asr_mind_model",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 examples per optimizer step per device
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=2000,
    gradient_checkpointing=True,
    # Mixed-precision training is only requested when a CUDA device is present;
    # asking for fp16 unconditionally makes TrainingArguments raise on a
    # CPU-only host.
    fp16=torch.cuda.is_available(),
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    # Saving and evaluating share the same 1000-step cadence, so every
    # evaluation has a matching checkpoint for load_best_model_at_end.
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    load_best_model_at_end=True,
    # "wer" is the key returned by compute_metrics; lower is better.
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()
# Streamlit demo: a slider whose current value is bound to `x`, followed by a
# line showing that value and its square.
x = st.slider('Select a value')
st.write(x, 'squared is', x * x)
|