{
  "_name_or_path": "MIT/ast-finetuned-speech-commands-v2",
  "architectures": [
    "ASTForAudioClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "frequency_stride": 10,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "A",
    "1": "B",
    "10": "K",
    "11": "L",
    "12": "M",
    "13": "N",
    "14": "O",
    "15": "P",
    "16": "Q",
    "17": "R",
    "18": "S",
    "19": "T",
    "2": "C",
    "20": "U",
    "21": "V",
    "22": "W",
    "23": "X",
    "24": "Y",
    "25": "Z",
    "26": "_silence_",
    "27": "_unknown_",
    "3": "D",
    "4": "E",
    "5": "F",
    "6": "G",
    "7": "H",
    "8": "I",
    "9": "J"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "J": 9,
    "K": 10,
    "L": 11,
    "M": 12,
    "N": 13,
    "O": 14,
    "P": 15,
    "Q": 16,
    "R": 17,
    "S": 18,
    "T": 19,
    "U": 20,
    "V": 21,
    "W": 22,
    "X": 23,
    "Y": 24,
    "Z": 25,
    "_silence_": 26,
    "_unknown_": 27
  },
  "layer_norm_eps": 1e-12,
  "max_length": 128,
  "model_type": "audio-spectrogram-transformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_mel_bins": 128,
  "patch_size": 16,
  "problem_type": "single_label_classification",
  "qkv_bias": true,
  "time_stride": 10,
  "torch_dtype": "float32",
  "transformers_version": "4.47.1"
}