opencampus
/

sign-whisper-german

@@ -42,47 +42,103 @@ TBD
 ### Training data
 TBD
-### Training process
-TBD
-### How to use
 ```python
 import torch
-from transformers import WhisperForConditionalGeneration, AutoProcessor, AutoTokenizer, TextStreamer
-from datasets import load_dataset
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-# Load model and processor
-model = WhisperForConditionalGeneration.from_pretrained(
     "mrprimenotes/sign-whisper-german",
-    torch_dtype=torch_dtype,
-    low_cpu_mem_usage=True,
-    use_safetensors=True
-).to(device)
-# Load the tokenizer for the model (for decoding)
 tokenizer = AutoTokenizer.from_pretrained("mrprimenotes/sign-whisper-german")
-# input preprocessing / feature extraction (TBD)
-# input_features = ...
-```
-#### Use raw model for inference
-```python
-output = model(input_features, labels=generated_ids)
-# e.g. output.loss
-# output.shape --> b x sq
-tokenizer.batch_decode(generated_ids, skip_special_tokens=False)
 ```
-### Use model with generate (work in progress...)
 ```python
 streamer = TextStreamer(tokenizer, skip_special_tokens=False) #only needed for streaming
 # Generate
 generated_ids = model.generate(
     input_features,
@@ -92,8 +148,4 @@ generated_ids = model.generate(
 )
 tokenizer.batch_decode(generated_ids, skip_special_tokens=False)
-```
-### Training
-When changing the configuration of the preprocessing convolution layers make sure the last output has the shape b x 1280 x seq. See custom config in model.py for configuration options.

 ### Training data
 TBD
+### Training process
 ```python
 import torch
+from transformers import WhisperForConditionalGeneration, AutoProcessor, AutoTokenizer, AutoConfig
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# When changing the configuration of the preprocessing convolution layers, make sure their final output has the shape b x 1280 x seq.
+# See the custom config in model.py for the available configuration options.
+config = AutoConfig.from_pretrained(
     "mrprimenotes/sign-whisper-german",
+    use_first_embeddings=True,
+    embedding_stride=2,
+    conv_dropout=0.1,
+    skip_connections=True,
+    conv_preprocessing_layers=[
+        {
+            "in_channels": 80,
+            "out_channels": 384,
+            "kernel_size": 5,
+            "padding": 2,
+            "activation": "gelu"
+        },
+        {
+            "in_channels": 384,
+            "out_channels": 384,
+            "kernel_size": 3,
+            "stride": 2,
+            "padding": 1,
+            "activation": "gelu"
+        }
+    ]
+)
 tokenizer = AutoTokenizer.from_pretrained("mrprimenotes/sign-whisper-german")
+# raw model outputs:
+# output = model(input_features, labels=labels)
+# e.g.
+# output.loss
+# output.shape --> b x seq
+train_dataset = YourSignDataset(...)
+val_dataset = YourSignDataset(...)
+# Define training arguments
+training_args = TrainingArguments(
+    output_dir="./sign-whisper-german",
+    num_train_epochs=3,
+    per_device_train_batch_size=1024,
+    per_device_eval_batch_size=256,
+    warmup_steps=500,
+    weight_decay=0.01,
+    # Logging settings
+    logging_dir="./logs",
+    logging_steps=50,
+    logging_strategy="steps",
+    # Evaluation
+    evaluation_strategy="steps",
+    eval_steps=100,
+    # Saving
+    save_strategy="steps",
+    save_steps=100,
+    save_total_limit=5,
+    resume_from_checkpoint=True,
+    load_best_model_at_end=True,
+    fp16=torch.cuda.is_available(),
+)
+# Initialize trainer with tokenizer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    tokenizer=tokenizer,
+)
+# Train the model
+trainer.train()
 ```
+### Use model for inference (with generate)
 ```python
+from transformers import TextStreamer
 streamer = TextStreamer(tokenizer, skip_special_tokens=False) #only needed for streaming
+# input preprocessing / feature extraction (TBD)
+# input_features = ...
 # Generate
 generated_ids = model.generate(
     input_features,
 )
 tokenizer.batch_decode(generated_ids, skip_special_tokens=False)
+```