Ali Sartaz Khan committed
Commit 186927e · 1 Parent(s): 1658f1b

converted w2v2-large ckpt

config.json ADDED
@@ -0,0 +1,108 @@
+ {
+   "activation_dropout": 0.1,
+   "adapter_attn_dim": null,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForPreTraining"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 768,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": false,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": false,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "group",
+   "feat_proj_dropout": 0.1,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.1,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "model_type": "wav2vec2",
+   "num_adapter_layers": 3,
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "num_negatives": 100,
+   "output_hidden_size": 1024,
+   "pad_token_id": 0,
+   "proj_codevector_dim": 768,
+   "tdnn_dilation": [
+     1,
+     2,
+     3,
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     512,
+     512,
+     512,
+     512,
+     1500
+   ],
+   "tdnn_kernel": [
+     5,
+     3,
+     3,
+     1,
+     1
+   ],
+   "transformers_version": "4.49.0.dev0",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 32,
+   "xvector_output_dim": 512
+ }
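
The config above is a standard transformers Wav2Vec2 configuration and can be inspected programmatically. A minimal sketch, assuming `transformers` is installed and `config.json` sits in the current directory:

# Load the committed config and print the dimensions that identify the large architecture.
from transformers import Wav2Vec2Config

config = Wav2Vec2Config.from_json_file("config.json")
print(config.hidden_size, config.num_hidden_layers, config.num_attention_heads)  # 1024 24 16
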
dict.ltr.txt ADDED
@@ -0,0 +1,28 @@
+ | 94802
+ E 51860
+ T 38431
+ A 33152
+ O 31495
+ N 28855
+ I 28794
+ H 27187
+ S 26071
+ R 23546
+ D 18289
+ L 16308
+ U 12400
+ M 10685
+ W 10317
+ C 9844
+ F 9062
+ G 8924
+ Y 8226
+ P 6890
+ B 6339
+ V 3936
+ K 3456
+ ' 1023
+ X 636
+ J 598
+ Q 437
+ Z 213
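
dict.ltr.txt is a fairseq letter dictionary: one symbol per line followed by its corpus count, with `|` serving as the word separator. A minimal parsing sketch, assuming the file keeps this two-column format:

# Read the fairseq letter dictionary into {symbol: count}.
counts = {}
with open("dict.ltr.txt", encoding="utf-8") as f:
    for line in f:
        symbol, count = line.rstrip("\n").rsplit(" ", 1)
        counts[symbol] = int(count)
print(len(counts), counts["|"])  # 28 symbols; "|" marks word boundaries
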
facebook/wav2vec2-large/config.json ADDED
@@ -0,0 +1,109 @@
+ {
+   "activation_dropout": 0.1,
+   "adapter_attn_dim": null,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForPreTraining"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 768,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": false,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": false,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "group",
+   "feat_proj_dropout": 0.1,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.1,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "model_type": "wav2vec2",
+   "num_adapter_layers": 3,
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "num_negatives": 100,
+   "output_hidden_size": 1024,
+   "pad_token_id": 0,
+   "proj_codevector_dim": 768,
+   "tdnn_dilation": [
+     1,
+     2,
+     3,
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     512,
+     512,
+     512,
+     512,
+     1500
+   ],
+   "tdnn_kernel": [
+     5,
+     3,
+     3,
+     1,
+     1
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.49.0.dev0",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 32,
+   "xvector_output_dim": 512
+ }
facebook/wav2vec2-large/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca232d88f02bf12c5e87575148aa99e8b1d11c98f2c9251a89bfd4d3e5512040
+ size 1269574136
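
The safetensors file itself is stored via Git LFS; only the pointer is committed here. Once the LFS object is pulled, the converted weights can be inspected without instantiating a model. A minimal sketch, assuming the `safetensors` package is installed:

# List a few converted parameter names and their shapes from the safetensors file.
from safetensors.torch import load_file

state_dict = load_file("facebook/wav2vec2-large/model.safetensors")
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))
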
facebook/wav2vec2-large/preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "do_normalize": true,
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
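
With config.json, model.safetensors, and preprocessor_config.json in place, the converted checkpoint can be loaded as a plain encoder, mirroring the non-finetuned branch of run_forward.py below. A minimal sketch; the local path and the dummy waveform are illustrative:

import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Load the converted checkpoint from the local folder added in this commit.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("./facebook/wav2vec2-large")
model = Wav2Vec2Model.from_pretrained("./facebook/wav2vec2-large")
model.eval()

# One second of silence at the expected 16 kHz sampling rate.
dummy_audio = torch.zeros(16000).numpy()
inputs = feature_extractor(dummy_audio, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    hidden_states = model(inputs.input_values).last_hidden_state
print(hidden_states.shape)  # (1, num_frames, 1024)
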
libri960_big.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c078e25708237c540e307b2687422792b17b8f0df8b63b8b07a4ddcbef66955c
+ size 3173903620
run_convert.sh ADDED
@@ -0,0 +1,16 @@
+ #!/usr/bin/env bash
+ hf_name=${1}
+ ckpt=${2}
+ dict=${3}
+
+ curPath=$(pwd)
+
+ cp ${dict} ${curPath}/dict.ltr.txt
+
+ # load a config that is equal to the config of the model you wish to convert
+ python -c "from transformers import Wav2Vec2Config; config = Wav2Vec2Config.from_pretrained('$hf_name'); config.save_pretrained('./');"
+
+ # pretrained only
+ eval "python /nlp/scr/askhan1/transformers/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py --pytorch_dump_folder ${hf_name} --checkpoint_path ${ckpt} --config_path ./config.json --not_finetuned"
+ # fine-tuned
+ #eval "python ../transformers/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py --pytorch_dump_folder ${hf_name} --checkpoint_path ${ckpt} --config_path ./config.json --dict_path ${curPath}/data/temp/dict.ltr.txt"
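
The inline `python -c` call above is compact; expanded, the config-preparation step amounts to the following sketch (assuming `transformers` is installed; the conversion script path is machine-specific and stays as committed):

# Fetch the reference config for the model being converted and save it
# next to the script as ./config.json, as run_convert.sh does via python -c.
from transformers import Wav2Vec2Config

hf_name = "facebook/wav2vec2-large"  # example value for ${1}; matches the folder added in this commit
config = Wav2Vec2Config.from_pretrained(hf_name)
config.save_pretrained("./")
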
run_forward.py ADDED
@@ -0,0 +1,152 @@
+ #!/usr/bin/env python3
+ import datasets
+ import fairseq
+ import torch
+ import os
+
+ import soundfile as sf
+ from datasets import load_dataset
+ import sys
+ from shutil import copyfile
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
+
+ hf_path = str(sys.argv[1])
+ fairseq_wav2vec2_path = str(sys.argv[2])
+ finetuned = bool(int(sys.argv[3]))
+
+
+ if finetuned:
+     processor = Wav2Vec2Processor.from_pretrained(hf_path)
+     model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
+         [fairseq_wav2vec2_path], arg_overrides={"data": "../add_wav2vec/data/temp"}
+     )
+     hf_model = Wav2Vec2ForCTC.from_pretrained(hf_path)
+ else:
+     processor = Wav2Vec2FeatureExtractor.from_pretrained(hf_path)
+     model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([fairseq_wav2vec2_path])
+     hf_model = Wav2Vec2Model.from_pretrained(hf_path)
+
+ model = model[0]
+ model.eval()
+
+
+ def test_feature_extractor(hf_feat_extractor, fsq_feat_extract, example_wav):
+     # set hf_feat_extractor.output to dummy
+     fsq_output = fsq_feat_extract(example_wav)
+     hf_output = hf_feat_extractor(example_wav)
+
+     assert (
+         hf_output.shape == fsq_output.shape
+     ), f"Shapes don't match. Got {hf_output.shape} for HF and {fsq_output.shape} for fsq"
+     assert torch.allclose(hf_output, fsq_output, atol=1e-3)
+
+
+ def test_full_encoder(hf_model, fsq_model, example_wav, attention_mask):
+     fsq_output = fsq_model(example_wav, padding_mask=attention_mask.ne(1), mask=False, features_only=True)["x"]
+     hf_output = hf_model(example_wav, attention_mask=attention_mask)[0]
+
+     assert (
+         hf_output.shape == fsq_output.shape
+     ), f"Shapes don't match. Got {hf_output.shape} for HF and {fsq_output.shape} for fsq"
+     assert torch.allclose(hf_output, fsq_output, atol=1e-2)
+
+
+ def test_full_model(hf_model, fsq_model, example_wav, attention_mask):
+     fsq_output = fsq_model(source=example_wav, padding_mask=attention_mask.ne(1))["encoder_out"]
+     hf_output = hf_model(example_wav, attention_mask=attention_mask)[0].transpose(0, 1)
+
+     assert (
+         hf_output.shape == fsq_output.shape
+     ), f"Shapes don't match. Got {hf_output.shape} for HF and {fsq_output.shape} for fsq"
+     assert torch.allclose(hf_output, fsq_output, atol=1e-2)
+
+
+ def test_loss(hf_model, fsq_model, example_wav, attention_mask, target):
+     from fairseq.criterions.ctc import CtcCriterion, CtcCriterionConfig
+     from fairseq.tasks.audio_pretraining import AudioPretrainingConfig, AudioPretrainingTask
+     audio_cfg = AudioPretrainingConfig(labels="ltr", data="./data")
+     task = AudioPretrainingTask.setup_task(audio_cfg)
+     ctc = CtcCriterion(CtcCriterionConfig(), task)
+     fsq_model.train()
+
+     labels_dict = processor.tokenizer(target, padding="longest", return_tensors="pt")
+     labels = labels_dict.input_ids
+     target_lengths = labels_dict.attention_mask.sum(-1)
+
+     sample = {
+         "net_input": {
+             "source": example_wav,
+             "padding_mask": attention_mask.ne(1),
+         },
+         "target": labels,
+         "target_lengths": target_lengths,
+         "id": torch.zeros((1,)),
+     }
+
+     loss, _, _ = ctc(fsq_model, sample)
+
+     labels = labels_dict.attention_mask * labels + (1 - labels_dict.attention_mask) * -100
+
+     hf_model.config.ctc_loss_reduction = "mean"
+     hf_loss = hf_model(example_wav, attention_mask=attention_mask, labels=labels).loss
+
+     print("Loss", loss)
+     print("Hf loss", hf_loss)
+
+
+ def test_all(example_wav, attention_mask):
+     with torch.no_grad():
+         if finetuned:
+             test_feature_extractor(
+                 hf_model.wav2vec2.feature_extractor, model.w2v_encoder.w2v_model.feature_extractor, example_wav
+             )
+         else:
+             test_feature_extractor(
+                 hf_model.feature_extractor, model.feature_extractor, example_wav
+             )
+     print("Succeeded feature extractor test")
+
+     with torch.no_grad():
+         # IMPORTANT: It is assumed that layer_norm_first is FALSE
+         # This is the case for `wav2vec_small_960h.pt`, but might not be for all models
+         # Adapt if necessary
+         if finetuned:
+             test_full_encoder(hf_model.wav2vec2, model.w2v_encoder.w2v_model, example_wav, attention_mask)
+         else:
+             test_full_encoder(hf_model, model, example_wav, attention_mask)
+     print("Succeeded full encoder test")
+
+     if finetuned:
+         with torch.no_grad():
+             # IMPORTANT: It is assumed that layer_norm_first is FALSE
+             # This is the case for `wav2vec_small_960h.pt`, but might not be for all models
+             # Adapt if necessary
+             test_full_model(hf_model, model, example_wav, attention_mask)
+         print("Succeeded full model test")
+
+
+ dummy_speech_data = datasets.load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+
+
+ def map_to_array(batch):
+     speech_array, _ = sf.read(batch["file"])
+     batch["speech"] = speech_array
+     return batch
+
+ def map_to_array_mp3(batch, i):
+     speech_array, sr = sf.read(f"/home/patrick/hugging_face/add_wav2vec/common_voice/cv-corpus-6.1-2020-12-11/nl/converted/sample_{i}.wav")
+     batch["speech"] = speech_array
+     batch["sampling_rate"] = sr
+     return batch
+
+
+ dummy_speech_data = dummy_speech_data.map(map_to_array, remove_columns=["file"])
+ inputs = processor(dummy_speech_data[:3]["speech"], return_tensors="pt", padding="longest", return_attention_mask=True)
+
+ transcription = dummy_speech_data[:3]["text"]
+
+ input_values = inputs.input_values
+ attention_mask = inputs.attention_mask
+
+ test_all(input_values, attention_mask)
+ #test_loss(hf_model, model, input_values, attention_mask, transcription)
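
run_forward.py expects three positional arguments: the HF model directory, the fairseq checkpoint path, and a 0/1 flag marking whether the checkpoint is fine-tuned. A hedged sketch of invoking it against the files added in this commit (whether this exact pairing was used is an assumption):

import subprocess
import sys

# 0 = pretrained-only checkpoint, matching the --not_finetuned branch in run_convert.sh.
subprocess.run(
    [sys.executable, "run_forward.py", "facebook/wav2vec2-large", "libri960_big.pt", "0"],
    check=True,
)
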