IliaLarchenko committed
Commit 1140640 · verified · Parent: 65d6590

Upload folder using huggingface_hub

Files changed (6)
  1. .gitattributes +1 -0
  2. README.md +38 -0
  3. config.json +50 -0
  4. config.yaml +209 -0
  5. model.safetensors +3 -0
  6. replay.mp4 +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ replay.mp4 filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,38 @@
+ ---
+ library_name: lerobot
+ tags:
+ - model_hub_mixin
+ - pytorch_model_hub_mixin
+ - robotics
+ - dot
+ license: apache-2.0
+ datasets:
+ - lerobot/pusht
+ pipeline_tag: robotics
+ ---
+
+ # Model Card for "Decoder Only Transformer (DOT) Policy" for the PushT images dataset
+
+ Read more about the model and implementation details in the [DOT Policy repository](https://github.com/IliaLarchenko/dot_policy).
+
+ This model was trained with the [LeRobot library](https://huggingface.co/lerobot) and achieves state-of-the-art behavior cloning results on the PushT images dataset: a 74.2% success rate (and 0.936 average max reward) vs. ~69% for the previous state-of-the-art models (Diffusion Policy and VQ-BeT perform about the same).
+
+ This result is achieved without checkpoint selection and is easy to reproduce.
+
+ You can use this model by installing LeRobot from [this branch](https://github.com/IliaLarchenko/lerobot/tree/dot).
+
+ To train the model:
+
+ ```bash
+ python lerobot/scripts/train.py policy=dot_pusht_image env=pusht
+ ```
+
+ To evaluate the model:
+
+ ```bash
+ python lerobot/scripts/eval.py -p IliaLarchenko/dot_pusht_images eval.n_episodes=1000 eval.batch_size=100 seed=1000000
+ ```
+
+ Model size:
+ - Total parameters: 14.1M
+ - Trainable parameters: 2.9M
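The model-card tags (`model_hub_mixin`, `pytorch_model_hub_mixin`) imply the policy can be loaded directly from the Hub via `from_pretrained`. Below is a minimal loading sketch; the module path and class name `DOTPolicy`, and the standard LeRobot `select_action` interface, are assumptions about the `dot` branch rather than something this commit confirms.

```python
# Hypothetical sketch: assumes the `dot` branch mirrors LeRobot's usual policy
# layout; adjust the import to the actual branch if it differs.
import torch
from lerobot.common.policies.dot.modeling_dot import DOTPolicy  # assumed path

policy = DOTPolicy.from_pretrained("IliaLarchenko/dot_pusht_images")
policy.eval()

# One inference step; batch keys and shapes follow the config's input_shapes.
batch = {
    "observation.image": torch.zeros(1, 3, 96, 96),  # dummy RGB frame in [0, 1]
    "observation.state": torch.zeros(1, 2),          # dummy agent position
}
with torch.no_grad():
    action = policy.select_action(batch)  # (1, 2) action, per output_shapes
print(action.shape)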
config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "alpha": 0.75,
+   "crop_scale": 0.8,
+   "dim_feedforward": 512,
+   "dim_model": 128,
+   "dropout": 0.1,
+   "inference_horizon": 20,
+   "input_normalization_modes": {
+     "observation.image": "mean_std",
+     "observation.state": "min_max"
+   },
+   "input_shapes": {
+     "observation.image": [
+       3,
+       96,
+       96
+     ],
+     "observation.state": [
+       2
+     ]
+   },
+   "lookback_aug": 5,
+   "lookback_obs_steps": 10,
+   "lora_rank": 20,
+   "merge_lora": true,
+   "n_decoder_layers": 8,
+   "n_heads": 8,
+   "n_obs_steps": 3,
+   "noise_decay": 0.999995,
+   "output_normalization_modes": {
+     "action": "min_max"
+   },
+   "output_shapes": {
+     "action": [
+       2
+     ]
+   },
+   "pre_norm": true,
+   "predict_every_n": 1,
+   "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
+   "rescale_shape": [
+     96,
+     96
+   ],
+   "return_every_n": 2,
+   "state_noise": 0.01,
+   "train_alpha": 0.9,
+   "train_horizon": 20,
+   "vision_backbone": "resnet18"
+ }
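The config pairs a prediction horizon (`inference_horizon: 20`) with decay parameters (`alpha: 0.75`, `train_alpha: 0.9`). That combination reads like temporal ensembling of overlapping action chunks (the smoothing scheme popularized by ACT); the sketch below is a generic illustration of that technique under the assumption that `alpha` is a per-step decay weight, not a verbatim copy of DOT's inference code.

```python
# Generic temporal-ensembling sketch; illustrative only, with an assumed
# "newer predictions weigh more" convention.
import numpy as np

def ensemble_action(chunks: list[np.ndarray], t: int, alpha: float = 0.75) -> np.ndarray:
    """Blend every prediction made for timestep t, down-weighting older chunks.

    chunks[k] is the (horizon, action_dim) chunk predicted at step k, so it
    covers timesteps k .. k + horizon - 1; alpha ** (t - k) decays with age.
    """
    preds, weights = [], []
    for k, chunk in enumerate(chunks):
        offset = t - k  # where timestep t falls inside chunk k
        if 0 <= offset < len(chunk):
            preds.append(chunk[offset])
            weights.append(alpha ** offset)
    w = np.array(weights) / np.sum(weights)
    return (np.stack(preds) * w[:, None]).sum(axis=0)

# Example: three chunks predicted at steps 0, 1, 2; blend the action for t = 2.
chunks = [np.random.rand(20, 2) for _ in range(3)]
print(ensemble_action(chunks, t=2))
```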
config.yaml ADDED
@@ -0,0 +1,209 @@
+ resume: false
+ device: cuda
+ use_amp: true
+ seed: 100000
+ dataset_repo_id: lerobot/pusht
+ video_backend: pyav
+ training:
+   offline_steps: 1000000
+   num_workers: 24
+   batch_size: 24
+   eval_freq: 10000
+   log_freq: 1000
+   save_checkpoint: true
+   save_freq: 50000
+   online_steps: 0
+   online_rollout_n_episodes: 1
+   online_rollout_batch_size: 1
+   online_steps_between_rollouts: 1
+   online_sampling_ratio: 0.5
+   online_env_seed: null
+   online_buffer_capacity: null
+   online_buffer_seed_size: 0
+   do_online_rollout_async: false
+   image_transforms:
+     enable: false
+     max_num_transforms: 3
+     random_order: false
+     brightness:
+       weight: 1
+       min_max:
+       - 0.8
+       - 1.2
+     contrast:
+       weight: 1
+       min_max:
+       - 0.8
+       - 1.2
+     saturation:
+       weight: 1
+       min_max:
+       - 0.5
+       - 1.5
+     hue:
+       weight: 1
+       min_max:
+       - -0.05
+       - 0.05
+     sharpness:
+       weight: 1
+       min_max:
+       - 0.8
+       - 1.2
+   save_model: true
+   grad_clip_norm: 50
+   lr: 0.0001
+   min_lr: 0.0001
+   lr_cycle_steps: 300000
+   weight_decay: 1.0e-05
+   delta_timestamps:
+     observation.image:
+     - -1.5
+     - -1.4
+     - -1.3
+     - -1.2
+     - -1.1
+     - -1.0
+     - -0.9
+     - -0.8
+     - -0.7
+     - -0.6
+     - -0.5
+     - -0.1
+     - 0.0
+     observation.state:
+     - -1.5
+     - -1.4
+     - -1.3
+     - -1.2
+     - -1.1
+     - -1.0
+     - -0.9
+     - -0.8
+     - -0.7
+     - -0.6
+     - -0.5
+     - -0.1
+     - 0.0
+     action:
+     - -1.5
+     - -1.4
+     - -1.3
+     - -1.2
+     - -1.1
+     - -1.0
+     - -0.9
+     - -0.8
+     - -0.7
+     - -0.6
+     - -0.5
+     - -0.1
+     - 0.0
+     - 0.1
+     - 0.2
+     - 0.3
+     - 0.4
+     - 0.5
+     - 0.6
+     - 0.7
+     - 0.8
+     - 0.9
+     - 1.0
+     - 1.1
+     - 1.2
+     - 1.3
+     - 1.4
+     - 1.5
+     - 1.6
+     - 1.7
+     - 1.8
+     - 1.9
+ eval:
+   n_episodes: 100
+   batch_size: 100
+   use_async_envs: false
+ wandb:
+   enable: true
+   disable_artifact: false
+   project: lerobot
+   notes: ''
+ fps: 10
+ env:
+   name: pusht
+   task: PushT-v0
+   image_size: 96
+   state_dim: 2
+   action_dim: 2
+   fps: ${fps}
+   episode_length: 300
+   gym:
+     obs_type: pixels_agent_pos
+     render_mode: rgb_array
+     visualization_width: 384
+     visualization_height: 384
+ override_dataset_stats:
+   observation.image:
+     mean:
+     - - - 0.485
+     - - - 0.456
+     - - - 0.406
+     std:
+     - - - 0.229
+     - - - 0.224
+     - - - 0.225
+   observation.state:
+     min:
+     - 0.0
+     - 0.0
+     max:
+     - 512.0
+     - 512.0
+   action:
+     min:
+     - 0.0
+     - 0.0
+     max:
+     - 512.0
+     - 512.0
+ policy:
+   name: dot
+   n_obs_steps: 3
+   train_horizon: 20
+   inference_horizon: 20
+   lookback_obs_steps: 10
+   lookback_aug: 5
+   input_shapes:
+     observation.image:
+     - 3
+     - 96
+     - 96
+     observation.state:
+     - ${env.state_dim}
+   output_shapes:
+     action:
+     - ${env.action_dim}
+   input_normalization_modes:
+     observation.image: mean_std
+     observation.state: min_max
+   output_normalization_modes:
+     action: min_max
+   vision_backbone: resnet18
+   pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
+   rescale_shape:
+   - 96
+   - 96
+   lora_rank: 20
+   merge_lora: true
+   crop_scale: 0.8
+   state_noise: 0.01
+   noise_decay: 0.999995
+   pre_norm: true
+   dim_model: 128
+   n_heads: 8
+   dim_feedforward: 512
+   n_decoder_layers: 8
+   dropout: 0.1
+   alpha: 0.75
+   train_alpha: 0.9
+   predict_every_n: 1
+   return_every_n: 2
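The `override_dataset_stats` block above pins ImageNet statistics for images and [0, 512] bounds for the PushT state and action, which makes the two normalization modes named in `input_normalization_modes` concrete. A minimal sketch of both, assuming LeRobot's usual convention of mapping `min_max` inputs to [-1, 1]:

```python
# Sketch of the config's two normalization modes using its explicit stats.
# The [-1, 1] target range for min_max is an assumption about LeRobot's
# convention, not something stated in this commit.
import torch

IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
STATE_MIN, STATE_MAX = 0.0, 512.0  # PushT position bounds, per the config

def normalize_image(img: torch.Tensor) -> torch.Tensor:
    """mean_std mode: (x - mean) / std on a (3, 96, 96) float image in [0, 1]."""
    return (img - IMAGENET_MEAN) / IMAGENET_STD

def normalize_state(state: torch.Tensor) -> torch.Tensor:
    """min_max mode: rescale [min, max] to [-1, 1]."""
    return (state - STATE_MIN) / (STATE_MAX - STATE_MIN) * 2 - 1
```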
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f87cdd4cc31b979724c8bc0dde0076f5d533398d0a4f419edf5b961e38ee460
+ size 56412020
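Because the LFS pointer pins the weight file's SHA-256, a downloaded copy of `model.safetensors` can be checked against it offline. A small verification sketch using `huggingface_hub` and the standard library:

```python
# Verify the downloaded weights against the SHA-256 oid in the LFS pointer.
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download("IliaLarchenko/dot_pusht_images", "model.safetensors")

h = hashlib.sha256()
with open(path, "rb") as f:
    for block in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(block)

assert h.hexdigest() == "2f87cdd4cc31b979724c8bc0dde0076f5d533398d0a4f419edf5b961e38ee460"
print("OK:", path)
```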
replay.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c125bf9bc41d5c9eba693a5f9d1781148b5629910429d7c9b477c65eb33d53e0
+ size 146456