billy-sunday committed on
Commit 41b1069 · 1 Parent(s): 4004c75

Update app.py

Files changed (1): app.py +118 -3
app.py CHANGED
@@ -1,4 +1,119 @@
- import streamlit as st
-
- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
+ from datasets import load_dataset, Audio
+ from transformers import AutoProcessor
+ import torch
+ import numpy as np
+ from dataclasses import dataclass
+ from typing import Dict, List, Union
+ import evaluate
+
+ ds = "PolyAI/minds14"
+ # ds = "RaysDipesh/obama-voice-samples-283"
+ minds = load_dataset(ds, name="en-US", split="train[:100]")
+ minds = minds.train_test_split(test_size=0.2)
+ print(minds)
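+
+ # train_test_split yields a DatasetDict with an 80/20 "train"/"test" split.
+ # Only the "audio" and "transcription" columns matter for CTC fine-tuning,
+ # so the rest are dropped.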
+ minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])
+
+ processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
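+
+ # MInDS-14 audio is sampled at 8 kHz, while Wav2Vec2 was pretrained on
+ # 16 kHz speech, so the audio column is resampled (lazily, on access).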
+ minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
+ print(minds["train"][0])
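+
+ # prepare_dataset runs each waveform and its transcription through the
+ # processor, producing model inputs ("input_values") and CTC labels, plus an
+ # "input_length" column that group_by_length uses below.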
+ def prepare_dataset(batch):
+     audio = batch["audio"]
+     batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"])
+     batch["input_length"] = len(batch["input_values"][0])
+     return batch
+
+ # The Wav2Vec2 vocabulary is uppercase only, so transcriptions are uppercased
+ # to match it.
+ def uppercase(example):
+     return {"transcription": example["transcription"].upper()}
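+
+ # Audio inputs and text labels have different lengths, so they are padded
+ # separately; label padding is replaced with -100 so the CTC loss ignores it.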
+ @dataclass
+ class DataCollatorCTCWithPadding:
+     processor: AutoProcessor
+     padding: Union[bool, str] = "longest"
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         # Split inputs and labels since they have to be of different lengths
+         # and need different padding methods.
+         input_features = [{"input_values": feature["input_values"][0]} for feature in features]
+         label_features = [{"input_ids": feature["labels"]} for feature in features]
+
+         batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
+         labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")
+
+         # Replace padding with -100 to ignore it in the loss.
+         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+         batch["labels"] = labels
+         return batch
+
+ minds = minds.map(uppercase)
+ encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)
+
+ data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")
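+
+ # Word error rate (WER) is the standard ASR metric; lower is better.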
+ wer_metric = evaluate.load("wer")
+
+ def compute_metrics(pred):
+     pred_logits = pred.predictions
+     pred_ids = np.argmax(pred_logits, axis=-1)
+
+     # Restore the pad token id so the labels can be decoded back to text.
+     pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
+
+     pred_str = processor.batch_decode(pred_ids)
+     label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
+
+     wer = wer_metric.compute(predictions=pred_str, references=label_str)
+     return {"wer": wer}
+
+ from transformers import AutoModelForCTC, TrainingArguments, Trainer
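+
+ # Load the pretrained encoder with a freshly initialized CTC head; the
+ # tokenizer's pad token id also serves as the CTC blank token.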
+ model = AutoModelForCTC.from_pretrained(
+     "facebook/wav2vec2-base",
+     ctc_loss_reduction="mean",
+     pad_token_id=processor.tokenizer.pad_token_id,
+ )
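+
+ # The effective train batch size is 8 x 2 gradient-accumulation steps = 16;
+ # group_by_length batches similar-length clips together to reduce padding.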
+ training_args = TrainingArguments(
+     output_dir="my_awesome_asr_mind_model",
+     per_device_train_batch_size=8,
+     gradient_accumulation_steps=2,
+     learning_rate=1e-5,
+     warmup_steps=500,
+     max_steps=2000,
+     gradient_checkpointing=True,
+     fp16=True,
+     group_by_length=True,
+     evaluation_strategy="steps",
+     per_device_eval_batch_size=8,
+     save_steps=1000,
+     eval_steps=1000,
+     logging_steps=25,
+     load_best_model_at_end=True,
+     metric_for_best_model="wer",
+     greater_is_better=False,
+     push_to_hub=True,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=encoded_minds["train"],
+     eval_dataset=encoded_minds["test"],
+     tokenizer=processor,
+     data_collator=data_collator,
+     compute_metrics=compute_metrics,
+ )
+
+ trainer.train()
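+
+ # Optional: a minimal inference sketch, assuming training completed and the
+ # checkpoint was saved to output_dir:
+ #
+ #     from transformers import pipeline
+ #     asr = pipeline("automatic-speech-recognition", model="my_awesome_asr_mind_model")
+ #     print(asr(minds["test"][0]["audio"]["array"]))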