opencampus
/

sign-whisper-german

@@ -50,36 +50,36 @@ from transformers import WhisperForConditionalGeneration, AutoProcessor, AutoTok
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-# When changing the configuration of the preprocessing convolution layers make sure their final output has the shape b x 1280 x seq.
 # See custom config in model.py for configuration options.
 config = AutoConfig.from_pretrained(
     "mrprimenotes/sign-whisper-german",
     trust_remote_code=True,
     use_first_embeddings=True,
-    embedding_stride=2,
-    conv_dropout=0.1,
     skip_connections=True,
-    conv_preprocessing_layers=[
-          {
-              "in_channels": 128,
-              "out_channels": 1280,
-              "kernel_size": 3,
-              "stride": 1,
-              "padding": 1,
-              "activation": "gelu",
-              "bias": True
-          },
-          {
-              "in_channels": 1280,
-              "out_channels": 1280,
-              "kernel_size": 3,
-              "stride": 2,
-              "padding": 1,
-              "activation": "gelu",
-              "bias": True
-          }
-      ]
 )
 tokenizer = AutoTokenizer.from_pretrained("mrprimenotes/sign-whisper-german")
@@ -95,7 +95,7 @@ model = AutoModel.from_pretrained(
     device_map='auto'
 ).to(device)
-# raw model outputs:
 # output = model(input_features, labels=labels)
 # e.g.
 # output.loss
@@ -104,6 +104,9 @@ model = AutoModel.from_pretrained(
 train_dataset = YourSignDataset(...)
 val_dataset = YourSignDataset(...)
 # Define training arguments
 training_args = TrainingArguments(
     output_dir="./sign-whisper-german",

 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 # See custom config in model.py for configuration options.
+# First load the config using AutoConfig
 config = AutoConfig.from_pretrained(
     "mrprimenotes/sign-whisper-german",
     trust_remote_code=True,
     use_first_embeddings=True,
+    #embedding_stride=2,
+    #conv_dropout=0.1,
     skip_connections=True,
+    conv_preprocessing_layers=[
+                { # When changing conv_preprocessing_layers, make sure the final layer's output has the shape b x 1280 x seq (presumably batch x channels x sequence length).
+                    "in_channels": 128,
+                    "out_channels": 1280,
+                    "kernel_size": 3,
+                    "stride": 1,
+                    "padding": 1,
+                    "activation": "gelu",
+                    "bias": True
+                },
+                {
+                    "in_channels": 1280,
+                    "out_channels": 1280,
+                    "kernel_size": 3,
+                    "stride": 1,
+                    "padding": 1,
+                    "activation": "gelu",
+                    "bias": True
+                }
+            ]
 )
 tokenizer = AutoTokenizer.from_pretrained("mrprimenotes/sign-whisper-german")
     device_map='auto'
 ).to(device)
+# You can inspect the raw model outputs as follows:
 # output = model(input_features, labels=labels)
 # e.g.
 # output.loss
 train_dataset = YourSignDataset(...)
 val_dataset = YourSignDataset(...)
+# Freeze the decoder before setting up training
+model.freeze_decoder()
 # Define training arguments
 training_args = TrainingArguments(
     output_dir="./sign-whisper-german",