# Hydra-style instantiation spec for the IDM model (`_target_` names the
# class or callable for each node).
# NOTE(review): the source text had lost all indentation and carried stray
# `|` table residue; the nesting below is rebuilt from key semantics.
# Confirm it against the original config before relying on it.
model:
  _target_: gr00t.model.idm.IDM
  _convert_: object
  config:
    _target_: gr00t.model.idm.IDMConfig
    _recursive_: false
    model_dtype: float32
    hidden_size: 0
    action_horizon: 16
    action_dim: 32
    backbone_cfg:
      _target_: gr00t.model.backbone.IdentityBackbone
    action_head_cfg:
      _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM
      _convert_: object
      config:
        _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig
        _recursive_: false
        # Spelling ("seperator") kept verbatim; presumably matched by name
        # in the target config class.
        add_seperator_token: true
        add_pos_embed: true
        model_dtype: float32
        mm_vision_select_layer: -2
        max_state_dim: 44
        max_action_dim: 32
        hidden_size: 1024
        tune_vision_tower: true
        add_view_embed: true
        max_num_views: 6
        siglip_model_cfg:
          _target_: gr00t.model.action_head.siglip.SiglipModel.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # NOTE(review): placed as a sibling of siglip_model_cfg, not inside
        # it — inferred, confirm.
        siglip_hidden_size: 1024
        vl_self_attention_cfg:
          _target_: gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer
          positional_embeddings: null
          num_layers: 4
          num_attention_heads: 16
          attention_head_dim: 64
          dropout: 0.2
          final_dropout: true
        diffusion_model_cfg:
          _target_: gr00t.model.action_head.cross_attention_dit.DiT
          positional_embeddings: null
          num_layers: 8
          num_attention_heads: 16
          attention_head_dim: 64
          norm_type: ada_norm
          dropout: 0.2
          final_dropout: true
          output_dim: 1024
          interleave_self_attention: true
        mm_projector_cfg:
          _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjector
          _convert_: object
          config:
            _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig
            hidden_size: 1024
            mm_hidden_size: 1024
            mm_projector_type: mlp_doubledownsample
        action_dim: 32
        action_horizon: 16
        num_inference_timesteps: 16
        noise_beta_alpha: 1.5
        noise_beta_beta: 1.0
        noise_s: 0.999
        num_timestep_buckets: 1000
        # NOTE(review): could instead belong to IDMConfig (two levels up);
        # the flattened source does not disambiguate — confirm.
        backbone_features_projector_cfg: null
|
train_dataset: |
|
_target_: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec |
|
_convert_: object |
# Dataset mixture: a single entry listing 24 dataset paths with weight 1.0.
# NOTE(review): nesting rebuilt from flattened text (indentation and `|`
# residue were extraction damage); `dataset_weight` is assumed to belong to
# the same list item as `dataset_path` — confirm.
mixture_spec:
  - dataset_path:
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CloseDoubleDoor256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CloseDrawer256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CloseSingleDoor256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CoffeePressButton256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CoffeeServeMug256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CoffeeSetupMug256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.OpenDoubleDoor256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.OpenDrawer256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.OpenSingleDoor256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCabToCounter256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToCab256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToMicrowave256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToSink256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToStove256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPMicrowaveToCounter256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPSinkToCounter256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPStoveToCounter256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOffMicrowave256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOffSinkFaucet256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOffStove256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOnMicrowave256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOnSinkFaucet256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOnStove256_300
      - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnSinkSpout256_300
    dataset_weight: 1.0
|
dataset_class: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotSingleDataset |
|
all_modality_configs: |
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
robocasa_gr1_arms_only_fourier_hands:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.ego_view_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
robocasa_gr1_arms_waist_fourier_hands:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.ego_view_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
robocasa_gr1_fixed_lower_body_fourier_hands:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.agentview_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
      - state.neck
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
      - action.neck
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
robocasa_bimanual_panda_parallel_gripper:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.robot0_eye_in_hand_pad_res256_freq20
      - video.robot1_eye_in_hand_pad_res256_freq20
      - video.agentview_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.right_arm_eef_pos
      - state.right_arm_eef_quat
      - state.right_gripper_qpos
      - state.left_arm_eef_pos
      - state.left_arm_eef_quat
      - state.left_gripper_qpos
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.right_arm_eef_pos
      - action.right_arm_eef_rot
      - action.right_gripper_close
      - action.left_arm_eef_pos
      - action.left_arm_eef_rot
      - action.left_gripper_close
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
robocasa_bimanual_panda_inspire_hand:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.robot0_eye_in_hand_pad_res256_freq20
      - video.robot1_eye_in_hand_pad_res256_freq20
      - video.agentview_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.right_arm_eef_pos
      - state.right_arm_eef_quat
      - state.right_hand
      - state.left_arm_eef_pos
      - state.left_arm_eef_quat
      - state.left_hand
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.right_arm_eef_pos
      - action.right_arm_eef_rot
      - action.right_hand
      - action.left_arm_eef_pos
      - action.left_arm_eef_rot
      - action.left_hand
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
robocasa_panda_omron:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.left_view
      - video.right_view
      - video.wrist_view
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.end_effector_position_relative
      - state.end_effector_rotation_relative
      - state.gripper_qpos
      - state.base_position
      - state.base_rotation
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.end_effector_position
      - action.end_effector_rotation
      - action.gripper_close
      - action.base_motion
      - action.control_mode
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
gr1_unified:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.ego_view_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.coarse_action
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
oxe_droid:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.exterior_image_1_left_pad_res256_freq15
      - video.exterior_image_2_left_pad_res256_freq15
      - video.wrist_image_left_pad_res256_freq15
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_position
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.eef_position_delta
      - action.eef_rotation_delta
      - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.language.language_instruction
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
oxe_fractal:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.image_pad_res256_freq03
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closedness_commanded
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.world_vector
      - action.rotation_delta
      - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.language.natural_language_instruction
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
oxe_language_table:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.rgb_pad_res256_freq10
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.effector_translation
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.action
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.language.instruction
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
oxe_bridge:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.image_0
      - video.image_1
      - video.image_2
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closed
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.language.language_instruction
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Per-modality ModalityConfig entries for this embodiment tag.
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
# NOTE(review): this entry has no `lapa_action` modality, unlike the other
# embodiment entries in this mapping — confirm that is intentional.
agibot:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.top_head
      - video.hand_left
      - video.hand_right
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.left_arm_joint_position
      - state.right_arm_joint_position
      - state.left_effector_position
      - state.right_effector_position
      - state.head_position
      - state.waist_position
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.left_arm_joint_position
      - action.right_arm_joint_position
      - action.left_effector_position
      - action.right_effector_position
      - action.head_position
      - action.waist_position
      - action.robot_velocity
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.agibot.task_description
|
all_transforms: |
# Transform pipeline for this embodiment tag: a ComposedModalityTransform
# whose `transforms` list is given in order (video, state, action, concat,
# then the GR00TIDMTransform).
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
robocasa_gr1_arms_only_fourier_hands:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.ego_view_pad_res256_freq20
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.ego_view_pad_res256_freq20
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.ego_view_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.ego_view_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.ego_view_pad_res256_freq20
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
      normalization_modes:
        state.left_arm: min_max
        state.right_arm: min_max
        state.left_hand: min_max
        state.right_hand: min_max
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
      normalization_modes:
        action.right_arm: min_max
        action.left_arm: min_max
        action.right_hand: min_max
        action.left_hand: min_max
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.ego_view_pad_res256_freq20
      state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
      action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # NOTE(review): placed on GR00TIDMTransform rather than inside
      # siglip_processor — inferred, confirm.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
# Transform pipeline for this embodiment tag: a ComposedModalityTransform
# whose `transforms` list is given in order (video, state, action, concat,
# then the GR00TIDMTransform).
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
robocasa_gr1_arms_waist_fourier_hands:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.ego_view_pad_res256_freq20
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.ego_view_pad_res256_freq20
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.ego_view_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.ego_view_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.ego_view_pad_res256_freq20
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      normalization_modes:
        state.left_arm: min_max
        state.right_arm: min_max
        state.left_hand: min_max
        state.right_hand: min_max
        state.waist: min_max
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      normalization_modes:
        action.right_arm: min_max
        action.left_arm: min_max
        action.right_hand: min_max
        action.left_hand: min_max
        action.waist: min_max
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.ego_view_pad_res256_freq20
      state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # NOTE(review): placed on GR00TIDMTransform rather than inside
      # siglip_processor — inferred, confirm.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
# Transform pipeline for this embodiment tag: a ComposedModalityTransform
# whose `transforms` list is given in order (video, state, action, concat,
# then the GR00TIDMTransform).
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
robocasa_gr1_fixed_lower_body_fourier_hands:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.agentview_pad_res256_freq20
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.agentview_pad_res256_freq20
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.agentview_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.agentview_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.agentview_pad_res256_freq20
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
      normalization_modes:
        state.left_arm: min_max
        state.right_arm: min_max
        state.left_hand: min_max
        state.right_hand: min_max
        state.waist: min_max
        state.neck: min_max
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
      normalization_modes:
        action.right_arm: min_max
        action.left_arm: min_max
        action.right_hand: min_max
        action.left_hand: min_max
        action.waist: min_max
        action.neck: min_max
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.agentview_pad_res256_freq20
      state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
      action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # NOTE(review): placed on GR00TIDMTransform rather than inside
      # siglip_processor — inferred, confirm.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
# Transform pipeline for this embodiment tag: a ComposedModalityTransform
# whose `transforms` list is given in order (video, state, action, concat,
# then the GR00TIDMTransform).
# NOTE(review): nesting rebuilt from flattened text; verify against the
# original config.
robocasa_bimanual_panda_parallel_gripper:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
      normalization_modes:
        state.right_arm_eef_pos: min_max
        state.right_gripper_qpos: min_max
        state.left_arm_eef_pos: min_max
        state.left_gripper_qpos: min_max
      # The quaternion keys get a target rotation instead of a
      # normalization mode.
      target_rotations:
        state.right_arm_eef_quat: rotation_6d
        state.left_arm_eef_quat: rotation_6d
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
      # Only the gripper keys carry a normalization mode here; the eef
      # pos/rot keys are listed in apply_to without one.
      normalization_modes:
        action.right_gripper_close: binary
        action.left_gripper_close: binary
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      state_concat_order:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
      action_concat_order:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # NOTE(review): placed on GR00TIDMTransform rather than inside
      # siglip_processor — inferred, confirm.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
|
robocasa_bimanual_panda_inspire_hand: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_hand |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_hand |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_hand |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_hand |
|
normalization_modes: |
|
state.right_arm_eef_pos: min_max |
|
state.right_hand: min_max |
|
state.left_arm_eef_pos: min_max |
|
state.left_hand: min_max |
|
target_rotations: |
|
state.right_arm_eef_quat: rotation_6d |
|
state.left_arm_eef_quat: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_hand |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_hand |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_hand |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_hand |
|
normalization_modes: |
|
action.right_hand: min_max |
|
action.left_hand: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
state_concat_order: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_hand |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_hand |
|
action_concat_order: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_hand |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_hand |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
robocasa_panda_omron: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
normalization_modes: |
|
state.end_effector_position_relative: min_max |
|
state.end_effector_rotation_relative: min_max |
|
state.gripper_qpos: min_max |
|
state.base_position: min_max |
|
state.base_rotation: min_max |
|
target_rotations: |
|
state.end_effector_rotation_relative: rotation_6d |
|
state.base_rotation: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
normalization_modes: |
|
action.end_effector_position: min_max |
|
action.end_effector_rotation: min_max |
|
action.gripper_close: binary |
|
action.base_motion: min_max |
|
action.control_mode: binary |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
state_concat_order: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
action_concat_order: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
gr1_unified: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
normalization_modes: |
|
state.left_arm: scale |
|
state.right_arm: scale |
|
state.left_hand: scale |
|
state.right_hand: scale |
|
state.waist: scale |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
normalization_modes: |
|
action.left_arm: scale |
|
action.right_arm: scale |
|
action.left_hand: scale |
|
action.right_hand: scale |
|
action.waist: scale |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.ego_view_pad_res256_freq20 |
|
state_concat_order: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
action_concat_order: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
oxe_droid: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_position |
|
normalization_modes: |
|
state.eef_position: min_max |
|
state.gripper_position: min_max |
|
target_rotations: |
|
state.eef_rotation: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.eef_position_delta |
|
- action.eef_rotation_delta |
|
- action.gripper_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.eef_position_delta |
|
- action.eef_rotation_delta |
|
- action.gripper_position |
|
normalization_modes: |
|
action.gripper_position: binary |
|
target_rotations: |
|
action.eef_rotation_delta: axis_angle |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
state_concat_order: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_position |
|
action_concat_order: |
|
- action.eef_position_delta |
|
- action.eef_rotation_delta |
|
- action.gripper_position |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
oxe_fractal: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.image_pad_res256_freq03 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.image_pad_res256_freq03 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.image_pad_res256_freq03 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.image_pad_res256_freq03 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.image_pad_res256_freq03 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closedness_commanded |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closedness_commanded |
|
normalization_modes: |
|
state.eef_position: min_max |
|
state.gripper_closedness_commanded: min_max |
|
target_rotations: |
|
state.eef_rotation: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.world_vector |
|
- action.rotation_delta |
|
- action.gripper_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.world_vector |
|
- action.rotation_delta |
|
- action.gripper_position |
|
normalization_modes: |
|
action.gripper_position: binary |
|
target_rotations: |
|
action.rotation_delta: axis_angle |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.image_pad_res256_freq03 |
|
state_concat_order: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closedness_commanded |
|
action_concat_order: |
|
- action.world_vector |
|
- action.rotation_delta |
|
- action.gripper_position |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
oxe_language_table: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.rgb_pad_res256_freq10 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.rgb_pad_res256_freq10 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.rgb_pad_res256_freq10 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.rgb_pad_res256_freq10 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.rgb_pad_res256_freq10 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.effector_translation |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.effector_translation |
|
normalization_modes: |
|
state.effector_translation: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.action |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.action |
|
normalization_modes: |
|
action.action: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.rgb_pad_res256_freq10 |
|
state_concat_order: |
|
- state.effector_translation |
|
action_concat_order: |
|
- action.action |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
oxe_bridge: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closed |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closed |
|
normalization_modes: |
|
state.eef_position: min_max |
|
state.gripper_closed: min_max |
|
target_rotations: |
|
state.eef_rotation: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.eef_position |
|
- action.eef_rotation |
|
- action.gripper_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.eef_position |
|
- action.eef_rotation |
|
- action.gripper_position |
|
normalization_modes: |
|
action.gripper_position: binary |
|
target_rotations: |
|
action.eef_rotation: axis_angle |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
state_concat_order: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closed |
|
action_concat_order: |
|
- action.eef_position |
|
- action.eef_rotation |
|
- action.gripper_position |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
agibot: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm_joint_position |
|
- state.right_arm_joint_position |
|
- state.left_effector_position |
|
- state.right_effector_position |
|
- state.head_position |
|
- state.waist_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm_joint_position |
|
- state.right_arm_joint_position |
|
- state.left_effector_position |
|
- state.right_effector_position |
|
- state.head_position |
|
- state.waist_position |
|
normalization_modes: |
|
state.left_arm_joint_position: min_max |
|
state.right_arm_joint_position: min_max |
|
state.left_effector_position: min_max |
|
state.right_effector_position: min_max |
|
state.head_position: min_max |
|
state.waist_position: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm_joint_position |
|
- action.right_arm_joint_position |
|
- action.left_effector_position |
|
- action.right_effector_position |
|
- action.head_position |
|
- action.waist_position |
|
- action.robot_velocity |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm_joint_position |
|
- action.right_arm_joint_position |
|
- action.left_effector_position |
|
- action.right_effector_position |
|
- action.head_position |
|
- action.waist_position |
|
- action.robot_velocity |
|
normalization_modes: |
|
action.left_arm_joint_position: min_max |
|
action.right_arm_joint_position: min_max |
|
action.left_effector_position: min_max |
|
action.right_effector_position: min_max |
|
action.head_position: min_max |
|
action.waist_position: min_max |
|
action.robot_velocity: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
state_concat_order: |
|
- state.left_arm_joint_position |
|
- state.right_arm_joint_position |
|
- state.left_effector_position |
|
- state.right_effector_position |
|
- state.head_position |
|
- state.waist_position |
|
action_concat_order: |
|
- action.left_arm_joint_position |
|
- action.right_arm_joint_position |
|
- action.left_effector_position |
|
- action.right_effector_position |
|
- action.head_position |
|
- action.waist_position |
|
- action.robot_velocity |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
metadata_versions: |
|
robocasa_gr1_arms_only_fourier_hands: '0217' |
|
robocasa_gr1_fixed_lower_body_fourier_hands: '0217' |
|
robocasa_bimanual_panda_parallel_gripper: '0217' |
|
robocasa_bimanual_panda_inspire_hand: '0217' |
|
robocasa_panda_omron: '0217' |
|
gr1_unified: '0225' |
|
oxe_droid: '0221' |
|
oxe_fractal: '0221' |
|
oxe_language_table: '0221' |
|
oxe_bridge: '0221' |
|
robocasa_gr1_arms_waist_fourier_hands: '0225' |
|
agibot: '0225' |
|
dataset_kwargs: |
|
video_backend: decord |
|
mixture_kwargs: |
|
training: true |
|
balance_dataset_weights: true |
|
seed: 42 |
|
trainer: |
|
_target_: gr00t.experiment.dual_brain.experiment.DualBrainTrainer |
|
_partial_: true |
|
_recursive_: false |
|
callbacks: null |
|
model: ??? |
|
train_dataset: ??? |
|
compute_dtype: ??? |
|
benchmark_time: false |
|
enable_profiling: false |
|
profiling_steps: 5 |
|
wandb_project: dream_idm |
|
output_dir: /mnt/amlfs-01/home/seonghyeony/checkpoints/gr00t_s_idm_24P_300 |
|
load_from_yaml: null |
|
gear_credentials: /mnt/amlfs-01/home/seonghyeony/.gear/data_credentials |
|
upload_checkpoints: false |
|
upload_every: 10000 |
|
upload_last_n_checkpoints: 5 |
|
remove_unused_columns: false |
|
bf16: true |
|
tf32: true |
|
global_batch_size: 1024 |
|
raise_error_if_global_batch_size_not_set: true |
|
per_device_train_batch_size: 32 |
|
per_device_eval_batch_size: 64 |
|
gradient_accumulation_steps: 1 |
|
dataloader_num_workers: 6 |
|
dataloader_pin_memory: false |
|
dataloader_persistent_workers: true |
|
optim: adamw_torch |
|
learning_rate: 0.0001 |
|
adam_beta1: 0.95 |
|
adam_beta2: 0.999 |
|
adam_epsilon: 1.0e-08 |
|
weight_decay: 1.0e-05 |
|
lr_scheduler_type: cosine |
|
warmup_ratio: 0.05 |
|
logging_steps: 10.0 |
|
num_train_epochs: 1000 |
|
max_steps: 60000 |
|
save_strategy: steps |
|
save_steps: 1000 |
|
eval_strategy: 'no' |
|
save_total_limit: 20 |
|
report_to: wandb |
|
seed: 42 |
|
do_eval: false |
|
gradient_checkpointing: false |
|
ddp_find_unused_parameters: false |
|
ddp_bucket_cap_mb: 100 |
|
ray_num_workers: 32 |
|
eval_bf16: true |
|
torch_compile_mode: null |
|
pretrained_model_path: null |
|
only_tune_projectors: false |
|
training_args: |
|
_target_: transformers.TrainingArguments |
|
output_dir: /mnt/amlfs-01/home/seonghyeony/checkpoints/gr00t_s_idm_24P_300 |
|
run_name: gr00t_s_idm_24P_300 |
|
remove_unused_columns: false |
|
deepspeed: gr00t/gr00t/experiment/dual_brain/configs/deepspeed/zero2.json |
|
gradient_checkpointing: false |
|
bf16: true |
|
tf32: true |
|
per_device_train_batch_size: 32 |
|
per_device_eval_batch_size: 64 |
|
gradient_accumulation_steps: 1 |
|
dataloader_num_workers: 6 |
|
dataloader_pin_memory: false |
|
dataloader_persistent_workers: true |
|
optim: adamw_torch |
|
adam_beta1: 0.95 |
|
adam_beta2: 0.999 |
|
adam_epsilon: 1.0e-08 |
|
learning_rate: 0.0001 |
|
weight_decay: 1.0e-05 |
|
warmup_ratio: 0.05 |
|
lr_scheduler_type: cosine |
|
logging_steps: 10.0 |
|
num_train_epochs: 1000 |
|
max_steps: 60000 |
|
save_strategy: steps |
|
save_steps: 1000 |
|
save_total_limit: 20 |
|
report_to: wandb |
|
seed: 42 |
|
do_eval: false |
|
ddp_find_unused_parameters: false |
|
ddp_bucket_cap_mb: 100 |
|
torch_compile_mode: null |
|
add_seperator_token: true |
|
add_pos_embed: true |
|
hidden_size: 1024 |
|
attn_dropout: 0.2 |
|
siglip_hidden_size: 1024 |
|
siglip_version: google/siglip2-large-patch16-256 |
|
action_head_cfg: |
|
_target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM |
|
_convert_: object |
|
config: |
|
_target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig |
|
_recursive_: false |
|
add_seperator_token: true |
|
add_pos_embed: true |
|
model_dtype: float32 |
|
mm_vision_select_layer: -2 |
|
max_state_dim: 44 |
|
max_action_dim: 32 |
|
hidden_size: 1024 |
|
tune_vision_tower: true |
|
add_view_embed: true |
|
max_num_views: 6 |
|
siglip_model_cfg: |
|
_target_: gr00t.model.action_head.siglip.SiglipModel.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
siglip_hidden_size: 1024 |
|
vl_self_attention_cfg: |
|
_target_: gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer |
|
positional_embeddings: null |
|
num_layers: 4 |
|
num_attention_heads: 16 |
|
attention_head_dim: 64 |
|
dropout: 0.2 |
|
final_dropout: true |
|
diffusion_model_cfg: |
|
_target_: gr00t.model.action_head.cross_attention_dit.DiT |
|
positional_embeddings: null |
|
num_layers: 8 |
|
num_attention_heads: 16 |
|
attention_head_dim: 64 |
|
norm_type: ada_norm |
|
dropout: 0.2 |
|
final_dropout: true |
|
output_dim: 1024 |
|
interleave_self_attention: true |
|
mm_projector_cfg: |
|
_target_: gr00t.model.action_head.multimodal_projector.MultimodalProjector |
|
_convert_: object |
|
config: |
|
_target_: gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig |
|
hidden_size: 1024 |
|
mm_hidden_size: 1024 |
|
mm_projector_type: mlp_doubledownsample |
|
action_dim: 32 |
|
action_horizon: 16 |
|
num_inference_timesteps: 16 |
|
noise_beta_alpha: 1.5 |
|
noise_beta_beta: 1.0 |
|
noise_s: 0.999 |
|
num_timestep_buckets: 1000 |
|
backbone_features_projector_cfg: null |
|
backbone_hidden_size: 0 |
|
backbone_cfg: |
|
_target_: gr00t.model.backbone.IdentityBackbone |
|
embodiment_tag_to_projector_index: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
num_visual_tokens_per_frame: 16 |
|
max_action_dim: 32 |
|
language_dropout_prob: 0.0 |
|
model_image_resolution: 224 |
|
max_sequence_length: 112 |
|
model_specific_transform: |
|
_target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
data_collator: |
|
_target_: gr00t.model.transforms_idm.DefaultDataCollatorGR00TIDM |
|
action_horizon: 16 |
|
totensor_cfg: |
|
_target_: gr00t.data.transform.VideoToTensor |
|
apply_to: ??? |
|
crop_cfg: |
|
_target_: gr00t.data.transform.VideoCrop |
|
apply_to: ??? |
|
scale: 0.95 |
|
mode: random |
|
resize_cfg: |
|
_target_: gr00t.data.transform.VideoResize |
|
apply_to: ??? |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
color_jitter_cfg: |
|
_target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: ??? |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
to_numpy_cfg: |
|
_target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: ??? |
|
modality_config_robocasa_gr1_arms_only_fourier_hands: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.ego_view_pad_res256_freq20 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
transform_robocasa_gr1_arms_only_fourier_hands: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
normalization_modes: |
|
state.left_arm: min_max |
|
state.right_arm: min_max |
|
state.left_hand: min_max |
|
state.right_hand: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
normalization_modes: |
|
action.right_arm: min_max |
|
action.left_arm: min_max |
|
action.right_hand: min_max |
|
action.left_hand: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.ego_view_pad_res256_freq20 |
|
state_concat_order: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
action_concat_order: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
modality_config_robocasa_gr1_arms_waist_fourier_hands: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.ego_view_pad_res256_freq20 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
transform_robocasa_gr1_arms_waist_fourier_hands: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
normalization_modes: |
|
state.left_arm: min_max |
|
state.right_arm: min_max |
|
state.left_hand: min_max |
|
state.right_hand: min_max |
|
state.waist: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
normalization_modes: |
|
action.right_arm: min_max |
|
action.left_arm: min_max |
|
action.right_hand: min_max |
|
action.left_hand: min_max |
|
action.waist: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.ego_view_pad_res256_freq20 |
|
state_concat_order: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
action_concat_order: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
modality_config_robocasa_panda_omron: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
transform_robocasa_panda_omron: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
normalization_modes: |
|
state.end_effector_position_relative: min_max |
|
state.end_effector_rotation_relative: min_max |
|
state.gripper_qpos: min_max |
|
state.base_position: min_max |
|
state.base_rotation: min_max |
|
target_rotations: |
|
state.end_effector_rotation_relative: rotation_6d |
|
state.base_rotation: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
normalization_modes: |
|
action.end_effector_position: min_max |
|
action.end_effector_rotation: min_max |
|
action.gripper_close: binary |
|
action.base_motion: min_max |
|
action.control_mode: binary |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
state_concat_order: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
action_concat_order: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
modality_config_robocasa_gr1_fixed_lower_body_fourier_hands: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.agentview_pad_res256_freq20 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- state.neck |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- action.neck |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
transform_robocasa_gr1_fixed_lower_body_fourier_hands: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.agentview_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.agentview_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.agentview_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.agentview_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.agentview_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- state.neck |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- state.neck |
|
normalization_modes: |
|
state.left_arm: min_max |
|
state.right_arm: min_max |
|
state.left_hand: min_max |
|
state.right_hand: min_max |
|
state.waist: min_max |
|
state.neck: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- action.neck |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- action.neck |
|
normalization_modes: |
|
action.right_arm: min_max |
|
action.left_arm: min_max |
|
action.right_hand: min_max |
|
action.left_hand: min_max |
|
action.waist: min_max |
|
action.neck: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.agentview_pad_res256_freq20 |
|
state_concat_order: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- state.neck |
|
action_concat_order: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- action.neck |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
modality_config_robocasa_bimanual_panda_parallel_gripper: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_gripper_qpos |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_gripper_qpos |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_gripper_close |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_gripper_close |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
# Preprocessing pipeline for the robocasa_bimanual_panda_parallel_gripper
# embodiment. ComposedModalityTransform is handed the steps listed under
# `transforms` (presumably applied in list order — confirm in gr00t).
transform_robocasa_bimanual_panda_parallel_gripper:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # ---- video: the same three camera streams go through every step ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    # Crop configured with mode "random" at scale 0.95.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      scale: 0.95
      mode: random
    # Resize to 224 x 224 with linear interpolation.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    # ---- state: positions/grippers use min_max, quaternions -> rotation_6d ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
      normalization_modes:
        state.right_arm_eef_pos: min_max
        state.right_gripper_qpos: min_max
        state.left_arm_eef_pos: min_max
        state.left_gripper_qpos: min_max
      target_rotations:
        state.right_arm_eef_quat: rotation_6d
        state.left_arm_eef_quat: rotation_6d
    # ---- action: only the gripper channels get a normalization mode ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
      normalization_modes:
        action.right_gripper_close: binary
        action.left_gripper_close: binary
    # ---- concatenation order for each modality ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      state_concat_order:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
      action_concat_order:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
    # ---- model-facing packing step ----
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: 'Perform the default behavior.'
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment tag name -> integer id (32 entries, ids 0-31).
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
# Data-loading spec for the robocasa_bimanual_panda_inspire_hand embodiment:
# one ModalityConfig per modality, each naming the keys to read and the
# delta_indices to sample.
# NOTE(review): delta_indices are presumably step offsets relative to the
# sampled timestep — confirm against gr00t.data.dataset.ModalityConfig.
modality_config_robocasa_bimanual_panda_inspire_hand:
  # Three camera streams, two indices each (0 and 16).
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.robot0_eye_in_hand_pad_res256_freq20
      - video.robot1_eye_in_hand_pad_res256_freq20
      - video.agentview_pad_res256_freq20
  # End-effector pose and hand state for both arms, index 0 only.
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.right_arm_eef_pos
      - state.right_arm_eef_quat
      - state.right_hand
      - state.left_arm_eef_pos
      - state.left_arm_eef_quat
      - state.left_hand
  # Sixteen consecutive indices (0-15) per action key.
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.right_arm_eef_pos
      - action.right_arm_eef_rot
      - action.right_hand
      - action.left_arm_eef_pos
      - action.left_arm_eef_rot
      - action.left_hand
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Preprocessing pipeline for the robocasa_bimanual_panda_inspire_hand
# embodiment. ComposedModalityTransform is handed the steps listed under
# `transforms` (presumably applied in list order — confirm in gr00t).
transform_robocasa_bimanual_panda_inspire_hand:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # ---- video: the same three camera streams go through every step ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    # Crop configured with mode "random" at scale 0.95.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      scale: 0.95
      mode: random
    # Resize to 224 x 224 with linear interpolation.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    # ---- state: positions/hands use min_max, quaternions -> rotation_6d ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
      normalization_modes:
        state.right_arm_eef_pos: min_max
        state.right_hand: min_max
        state.left_arm_eef_pos: min_max
        state.left_hand: min_max
      target_rotations:
        state.right_arm_eef_quat: rotation_6d
        state.left_arm_eef_quat: rotation_6d
    # ---- action: only the hand channels get a normalization mode ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
      normalization_modes:
        action.right_hand: min_max
        action.left_hand: min_max
    # ---- concatenation order for each modality ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      state_concat_order:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
      action_concat_order:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
    # ---- model-facing packing step ----
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: 'Perform the default behavior.'
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment tag name -> integer id (32 entries, ids 0-31).
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
# Data-loading spec for the gr1_unified embodiment: one ModalityConfig per
# modality, each naming the keys to read and the delta_indices to sample.
# NOTE(review): delta_indices are presumably step offsets relative to the
# sampled timestep — confirm against gr00t.data.dataset.ModalityConfig.
modality_config_gr1_unified:
  # Single ego-view camera, two indices (0 and 16).
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.ego_view_pad_res256_freq20
  # Arms, hands and waist, index 0 only.
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
  # Sixteen consecutive indices (0-15) per action key.
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.coarse_action
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Preprocessing pipeline for the gr1_unified embodiment.
# ComposedModalityTransform is handed the steps listed under `transforms`
# (presumably applied in list order — confirm in gr00t).
transform_gr1_unified:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # ---- video: a single ego-view stream goes through every step ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.ego_view_pad_res256_freq20
    # Crop configured with mode "random" at scale 0.95.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.ego_view_pad_res256_freq20
      scale: 0.95
      mode: random
    # Resize to 224 x 224 with linear interpolation.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.ego_view_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.ego_view_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.ego_view_pad_res256_freq20
    # ---- state: every key uses the "scale" normalization mode ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      normalization_modes:
        state.left_arm: scale
        state.right_arm: scale
        state.left_hand: scale
        state.right_hand: scale
        state.waist: scale
    # ---- action: every key uses the "scale" normalization mode ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      normalization_modes:
        action.left_arm: scale
        action.right_arm: scale
        action.left_hand: scale
        action.right_hand: scale
        action.waist: scale
    # ---- concatenation order for each modality ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.ego_view_pad_res256_freq20
      state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
    # ---- model-facing packing step ----
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: 'Perform the default behavior.'
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment tag name -> integer id (32 entries, ids 0-31).
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
# Data-loading spec for the oxe_droid embodiment: one ModalityConfig per
# modality, each naming the keys to read and the delta_indices to sample.
# NOTE(review): delta_indices are presumably step offsets relative to the
# sampled timestep — confirm against gr00t.data.dataset.ModalityConfig.
modality_config_oxe_droid:
  # Two exterior cameras plus a wrist camera, two indices each (0 and 16).
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.exterior_image_1_left_pad_res256_freq15
      - video.exterior_image_2_left_pad_res256_freq15
      - video.wrist_image_left_pad_res256_freq15
  # End-effector position/rotation and gripper position, index 0 only.
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_position
  # Sixteen consecutive indices (0-15) per action key.
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.eef_position_delta
      - action.eef_rotation_delta
      - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.language.language_instruction
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Preprocessing pipeline for the oxe_droid embodiment.
# ComposedModalityTransform is handed the steps listed under `transforms`
# (presumably applied in list order — confirm in gr00t).
transform_oxe_droid:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # ---- video: the same three camera streams go through every step ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
    # Crop configured with mode "random" at scale 0.95.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
      scale: 0.95
      mode: random
    # Resize to 224 x 224 with linear interpolation.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
    # ---- state: position/gripper use min_max, rotation -> rotation_6d ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_position
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_position
      normalization_modes:
        state.eef_position: min_max
        state.gripper_position: min_max
      target_rotations:
        state.eef_rotation: rotation_6d
    # ---- action: gripper is binary, rotation delta -> axis_angle ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.eef_position_delta
        - action.eef_rotation_delta
        - action.gripper_position
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.eef_position_delta
        - action.eef_rotation_delta
        - action.gripper_position
      normalization_modes:
        action.gripper_position: binary
      target_rotations:
        action.eef_rotation_delta: axis_angle
    # ---- concatenation order for each modality ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
      state_concat_order:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_position
      action_concat_order:
        - action.eef_position_delta
        - action.eef_rotation_delta
        - action.gripper_position
    # ---- model-facing packing step ----
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: 'Perform the default behavior.'
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment tag name -> integer id (32 entries, ids 0-31).
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
# Data-loading spec for the oxe_fractal embodiment: one ModalityConfig per
# modality, each naming the keys to read and the delta_indices to sample.
# NOTE(review): delta_indices are presumably step offsets relative to the
# sampled timestep — confirm against gr00t.data.dataset.ModalityConfig.
modality_config_oxe_fractal:
  # Single camera stream, two indices (0 and 16).
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.image_pad_res256_freq03
  # End-effector position/rotation and commanded gripper closedness, index 0.
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closedness_commanded
  # Sixteen consecutive indices (0-15) per action key.
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.world_vector
      - action.rotation_delta
      - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.language.natural_language_instruction
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Preprocessing pipeline for the oxe_fractal embodiment.
# ComposedModalityTransform is handed the steps listed under `transforms`
# (presumably applied in list order — confirm in gr00t).
transform_oxe_fractal:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # ---- video: a single camera stream goes through every step ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.image_pad_res256_freq03
    # Crop configured with mode "random" at scale 0.95.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.image_pad_res256_freq03
      scale: 0.95
      mode: random
    # Resize to 224 x 224 with linear interpolation.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.image_pad_res256_freq03
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.image_pad_res256_freq03
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.image_pad_res256_freq03
    # ---- state: position/gripper use min_max, rotation -> rotation_6d ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closedness_commanded
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closedness_commanded
      normalization_modes:
        state.eef_position: min_max
        state.gripper_closedness_commanded: min_max
      target_rotations:
        state.eef_rotation: rotation_6d
    # ---- action: gripper is binary, rotation delta -> axis_angle ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.world_vector
        - action.rotation_delta
        - action.gripper_position
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.world_vector
        - action.rotation_delta
        - action.gripper_position
      normalization_modes:
        action.gripper_position: binary
      target_rotations:
        action.rotation_delta: axis_angle
    # ---- concatenation order for each modality ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.image_pad_res256_freq03
      state_concat_order:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closedness_commanded
      action_concat_order:
        - action.world_vector
        - action.rotation_delta
        - action.gripper_position
    # ---- model-facing packing step ----
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: 'Perform the default behavior.'
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment tag name -> integer id (32 entries, ids 0-31).
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
# Data-loading spec for the oxe_language_table embodiment: one ModalityConfig
# per modality, each naming the keys to read and the delta_indices to sample.
# NOTE(review): delta_indices are presumably step offsets relative to the
# sampled timestep — confirm against gr00t.data.dataset.ModalityConfig.
modality_config_oxe_language_table:
  # Single RGB stream, two indices (0 and 16).
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.rgb_pad_res256_freq10
  # One state key, index 0 only.
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.effector_translation
  # Sixteen consecutive indices (0-15) for the single action key.
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.action
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.language.instruction
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Preprocessing pipeline for the oxe_language_table embodiment.
# ComposedModalityTransform is handed the steps listed under `transforms`
# (presumably applied in list order — confirm in gr00t).
transform_oxe_language_table:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # ---- video: a single RGB stream goes through every step ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.rgb_pad_res256_freq10
    # Crop configured with mode "random" at scale 0.95.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.rgb_pad_res256_freq10
      scale: 0.95
      mode: random
    # Resize to 224 x 224 with linear interpolation.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.rgb_pad_res256_freq10
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.rgb_pad_res256_freq10
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.rgb_pad_res256_freq10
    # ---- state: the single key uses min_max ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.effector_translation
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.effector_translation
      normalization_modes:
        state.effector_translation: min_max
    # ---- action: the single key uses min_max ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.action
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.action
      normalization_modes:
        action.action: min_max
    # ---- concatenation order for each modality ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.rgb_pad_res256_freq10
      state_concat_order:
        - state.effector_translation
      action_concat_order:
        - action.action
    # ---- model-facing packing step ----
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: 'Perform the default behavior.'
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment tag name -> integer id (32 entries, ids 0-31).
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
# Data-loading spec for the oxe_bridge embodiment: one ModalityConfig per
# modality, each naming the keys to read and the delta_indices to sample.
# NOTE(review): delta_indices are presumably step offsets relative to the
# sampled timestep — confirm against gr00t.data.dataset.ModalityConfig.
modality_config_oxe_bridge:
  # Three image streams, two indices each (0 and 16).
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.image_0
      - video.image_1
      - video.image_2
  # End-effector position/rotation and gripper-closed flag, index 0 only.
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closed
  # Sixteen consecutive indices (0-15) per action key.
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.language.language_instruction
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
# Preprocessing pipeline for the oxe_bridge embodiment.
# ComposedModalityTransform is handed the steps listed under `transforms`
# (presumably applied in list order — confirm in gr00t).
transform_oxe_bridge:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # ---- video: the same three image streams go through every step ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.image_0
        - video.image_1
        - video.image_2
    # Crop configured with mode "random" at scale 0.95.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.image_0
        - video.image_1
        - video.image_2
      scale: 0.95
      mode: random
    # Resize to 224 x 224 with linear interpolation.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.image_0
        - video.image_1
        - video.image_2
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.image_0
        - video.image_1
        - video.image_2
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.image_0
        - video.image_1
        - video.image_2
    # ---- state: position/gripper use min_max, rotation -> rotation_6d ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closed
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closed
      normalization_modes:
        state.eef_position: min_max
        state.gripper_closed: min_max
      target_rotations:
        state.eef_rotation: rotation_6d
    # ---- action: gripper is binary, rotation -> axis_angle ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
      normalization_modes:
        action.gripper_position: binary
      target_rotations:
        action.eef_rotation: axis_angle
    # ---- concatenation order for each modality ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.image_0
        - video.image_1
        - video.image_2
      state_concat_order:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closed
      action_concat_order:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
    # ---- model-facing packing step ----
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: 'Perform the default behavior.'
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment tag name -> integer id (32 entries, ids 0-31).
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_single_arm_panda_omron: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
# Data-loading spec for the agibot embodiment: one ModalityConfig per
# modality, each naming the keys to read and the delta_indices to sample.
# NOTE(review): delta_indices are presumably step offsets relative to the
# sampled timestep — confirm against gr00t.data.dataset.ModalityConfig.
# NOTE(review): no lapa_action entry is defined here, unlike the neighbouring
# modality_config_* sections in this file — confirm that is intentional.
modality_config_agibot:
  # Head camera plus one camera per hand, two indices each (0 and 16).
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.top_head
      - video.hand_left
      - video.hand_right
  # Six state keys, index 0 only.
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.left_arm_joint_position
      - state.right_arm_joint_position
      - state.left_effector_position
      - state.right_effector_position
      - state.head_position
      - state.waist_position
  # Sixteen consecutive indices (0-15); the action keys mirror the six state
  # keys and add action.robot_velocity.
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.left_arm_joint_position
      - action.right_arm_joint_position
      - action.left_effector_position
      - action.right_effector_position
      - action.head_position
      - action.waist_position
      - action.robot_velocity
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.agibot.task_description
transform_agibot: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm_joint_position |
|
- state.right_arm_joint_position |
|
- state.left_effector_position |
|
- state.right_effector_position |
|
- state.head_position |
|
- state.waist_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm_joint_position |
|
- state.right_arm_joint_position |
|
- state.left_effector_position |
|
- state.right_effector_position |
|
- state.head_position |
|
- state.waist_position |
|
normalization_modes: |
|
state.left_arm_joint_position: min_max |
|
state.right_arm_joint_position: min_max |
|
state.left_effector_position: min_max |
|
state.right_effector_position: min_max |
|
state.head_position: min_max |
|
state.waist_position: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm_joint_position |
|
- action.right_arm_joint_position |
|
- action.left_effector_position |
|
- action.right_effector_position |
|
- action.head_position |
|
- action.waist_position |
|
- action.robot_velocity |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm_joint_position |
|
- action.right_arm_joint_position |
|
- action.left_effector_position |
|
- action.right_effector_position |
|
- action.head_position |
|
- action.waist_position |
|
- action.robot_velocity |
|
normalization_modes: |
|
action.left_arm_joint_position: min_max |
|
action.right_arm_joint_position: min_max |
|
action.left_effector_position: min_max |
|
action.right_effector_position: min_max |
|
action.head_position: min_max |
|
action.waist_position: min_max |
|
action.robot_velocity: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
state_concat_order: |
|
- state.left_arm_joint_position |
|
- state.right_arm_joint_position |
|
- state.left_effector_position |
|
- state.right_effector_position |
|
- state.head_position |
|
- state.waist_position |
|
action_concat_order: |
|
- action.left_arm_joint_position |
|
- action.right_arm_joint_position |
|
- action.left_effector_position |
|
- action.right_effector_position |
|
- action.head_position |
|
- action.waist_position |
|
- action.robot_velocity |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
modality_configs: |
|
robocasa_gr1_arms_only_fourier_hands: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.ego_view_pad_res256_freq20 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
robocasa_gr1_arms_waist_fourier_hands: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.ego_view_pad_res256_freq20 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
robocasa_gr1_fixed_lower_body_fourier_hands: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.agentview_pad_res256_freq20 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- state.neck |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- action.neck |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
robocasa_bimanual_panda_parallel_gripper: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_gripper_qpos |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_gripper_qpos |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_gripper_close |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_gripper_close |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
robocasa_bimanual_panda_inspire_hand: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_hand |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_hand |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_hand |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_hand |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
robocasa_panda_omron: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.action.task_description |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
gr1_unified: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.ego_view_pad_res256_freq20 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.human.coarse_action |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
oxe_droid: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_position |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.eef_position_delta |
|
- action.eef_rotation_delta |
|
- action.gripper_position |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.language.language_instruction |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
oxe_fractal: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.image_pad_res256_freq03 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closedness_commanded |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.world_vector |
|
- action.rotation_delta |
|
- action.gripper_position |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.language.natural_language_instruction |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
oxe_language_table: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.rgb_pad_res256_freq10 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.effector_translation |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.action |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.language.instruction |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
oxe_bridge: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closed |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.eef_position |
|
- action.eef_rotation |
|
- action.gripper_position |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.language.language_instruction |
|
lapa_action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- lapa_action |
|
agibot: |
|
video: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 16 |
|
modality_keys: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
state: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- state.left_arm_joint_position |
|
- state.right_arm_joint_position |
|
- state.left_effector_position |
|
- state.right_effector_position |
|
- state.head_position |
|
- state.waist_position |
|
action: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
- 1 |
|
- 2 |
|
- 3 |
|
- 4 |
|
- 5 |
|
- 6 |
|
- 7 |
|
- 8 |
|
- 9 |
|
- 10 |
|
- 11 |
|
- 12 |
|
- 13 |
|
- 14 |
|
- 15 |
|
modality_keys: |
|
- action.left_arm_joint_position |
|
- action.right_arm_joint_position |
|
- action.left_effector_position |
|
- action.right_effector_position |
|
- action.head_position |
|
- action.waist_position |
|
- action.robot_velocity |
|
language: |
|
_target_: gr00t.data.dataset.ModalityConfig |
|
delta_indices: |
|
- 0 |
|
modality_keys: |
|
- annotation.agibot.task_description |
|
transforms: |
|
robocasa_gr1_arms_only_fourier_hands: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
normalization_modes: |
|
state.left_arm: min_max |
|
state.right_arm: min_max |
|
state.left_hand: min_max |
|
state.right_hand: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
normalization_modes: |
|
action.right_arm: min_max |
|
action.left_arm: min_max |
|
action.right_hand: min_max |
|
action.left_hand: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.ego_view_pad_res256_freq20 |
|
state_concat_order: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
action_concat_order: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
robocasa_gr1_arms_waist_fourier_hands: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
normalization_modes: |
|
state.left_arm: min_max |
|
state.right_arm: min_max |
|
state.left_hand: min_max |
|
state.right_hand: min_max |
|
state.waist: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
normalization_modes: |
|
action.right_arm: min_max |
|
action.left_arm: min_max |
|
action.right_hand: min_max |
|
action.left_hand: min_max |
|
action.waist: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.ego_view_pad_res256_freq20 |
|
state_concat_order: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
action_concat_order: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.agentview_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.agentview_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.agentview_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.agentview_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.agentview_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- state.neck |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- state.neck |
|
normalization_modes: |
|
state.left_arm: min_max |
|
state.right_arm: min_max |
|
state.left_hand: min_max |
|
state.right_hand: min_max |
|
state.waist: min_max |
|
state.neck: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- action.neck |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- action.neck |
|
normalization_modes: |
|
action.right_arm: min_max |
|
action.left_arm: min_max |
|
action.right_hand: min_max |
|
action.left_hand: min_max |
|
action.waist: min_max |
|
action.neck: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.agentview_pad_res256_freq20 |
|
state_concat_order: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- state.neck |
|
action_concat_order: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- action.neck |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
robocasa_bimanual_panda_parallel_gripper: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_gripper_qpos |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_gripper_qpos |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_gripper_qpos |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_gripper_qpos |
|
normalization_modes: |
|
state.right_arm_eef_pos: min_max |
|
state.right_gripper_qpos: min_max |
|
state.left_arm_eef_pos: min_max |
|
state.left_gripper_qpos: min_max |
|
target_rotations: |
|
state.right_arm_eef_quat: rotation_6d |
|
state.left_arm_eef_quat: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_gripper_close |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_gripper_close |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_gripper_close |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_gripper_close |
|
normalization_modes: |
|
action.right_gripper_close: binary |
|
action.left_gripper_close: binary |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
state_concat_order: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_gripper_qpos |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_gripper_qpos |
|
action_concat_order: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_gripper_close |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_gripper_close |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
robocasa_bimanual_panda_inspire_hand: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_hand |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_hand |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_hand |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_hand |
|
normalization_modes: |
|
state.right_arm_eef_pos: min_max |
|
state.right_hand: min_max |
|
state.left_arm_eef_pos: min_max |
|
state.left_hand: min_max |
|
target_rotations: |
|
state.right_arm_eef_quat: rotation_6d |
|
state.left_arm_eef_quat: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_hand |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_hand |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_hand |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_hand |
|
normalization_modes: |
|
action.right_hand: min_max |
|
action.left_hand: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.robot0_eye_in_hand_pad_res256_freq20 |
|
- video.robot1_eye_in_hand_pad_res256_freq20 |
|
- video.agentview_pad_res256_freq20 |
|
state_concat_order: |
|
- state.right_arm_eef_pos |
|
- state.right_arm_eef_quat |
|
- state.right_hand |
|
- state.left_arm_eef_pos |
|
- state.left_arm_eef_quat |
|
- state.left_hand |
|
action_concat_order: |
|
- action.right_arm_eef_pos |
|
- action.right_arm_eef_rot |
|
- action.right_hand |
|
- action.left_arm_eef_pos |
|
- action.left_arm_eef_rot |
|
- action.left_hand |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
robocasa_panda_omron: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
normalization_modes: |
|
state.end_effector_position_relative: min_max |
|
state.end_effector_rotation_relative: min_max |
|
state.gripper_qpos: min_max |
|
state.base_position: min_max |
|
state.base_rotation: min_max |
|
target_rotations: |
|
state.end_effector_rotation_relative: rotation_6d |
|
state.base_rotation: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
normalization_modes: |
|
action.end_effector_position: min_max |
|
action.end_effector_rotation: min_max |
|
action.gripper_close: binary |
|
action.base_motion: min_max |
|
action.control_mode: binary |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.left_view |
|
- video.right_view |
|
- video.wrist_view |
|
state_concat_order: |
|
- state.end_effector_position_relative |
|
- state.end_effector_rotation_relative |
|
- state.gripper_qpos |
|
- state.base_position |
|
- state.base_rotation |
|
action_concat_order: |
|
- action.end_effector_position |
|
- action.end_effector_rotation |
|
- action.gripper_close |
|
- action.base_motion |
|
- action.control_mode |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
gr1_unified: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.ego_view_pad_res256_freq20 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
normalization_modes: |
|
state.left_arm: scale |
|
state.right_arm: scale |
|
state.left_hand: scale |
|
state.right_hand: scale |
|
state.waist: scale |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
normalization_modes: |
|
action.left_arm: scale |
|
action.right_arm: scale |
|
action.left_hand: scale |
|
action.right_hand: scale |
|
action.waist: scale |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.ego_view_pad_res256_freq20 |
|
state_concat_order: |
|
- state.left_arm |
|
- state.right_arm |
|
- state.left_hand |
|
- state.right_hand |
|
- state.waist |
|
action_concat_order: |
|
- action.left_arm |
|
- action.right_arm |
|
- action.left_hand |
|
- action.right_hand |
|
- action.waist |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
oxe_droid: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_position |
|
normalization_modes: |
|
state.eef_position: min_max |
|
state.gripper_position: min_max |
|
target_rotations: |
|
state.eef_rotation: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.eef_position_delta |
|
- action.eef_rotation_delta |
|
- action.gripper_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.eef_position_delta |
|
- action.eef_rotation_delta |
|
- action.gripper_position |
|
normalization_modes: |
|
action.gripper_position: binary |
|
target_rotations: |
|
action.eef_rotation_delta: axis_angle |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.exterior_image_1_left_pad_res256_freq15 |
|
- video.exterior_image_2_left_pad_res256_freq15 |
|
- video.wrist_image_left_pad_res256_freq15 |
|
state_concat_order: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_position |
|
action_concat_order: |
|
- action.eef_position_delta |
|
- action.eef_rotation_delta |
|
- action.gripper_position |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
oxe_fractal: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.image_pad_res256_freq03 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.image_pad_res256_freq03 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.image_pad_res256_freq03 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.image_pad_res256_freq03 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.image_pad_res256_freq03 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closedness_commanded |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closedness_commanded |
|
normalization_modes: |
|
state.eef_position: min_max |
|
state.gripper_closedness_commanded: min_max |
|
target_rotations: |
|
state.eef_rotation: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.world_vector |
|
- action.rotation_delta |
|
- action.gripper_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.world_vector |
|
- action.rotation_delta |
|
- action.gripper_position |
|
normalization_modes: |
|
action.gripper_position: binary |
|
target_rotations: |
|
action.rotation_delta: axis_angle |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.image_pad_res256_freq03 |
|
state_concat_order: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closedness_commanded |
|
action_concat_order: |
|
- action.world_vector |
|
- action.rotation_delta |
|
- action.gripper_position |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
oxe_language_table: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.rgb_pad_res256_freq10 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.rgb_pad_res256_freq10 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.rgb_pad_res256_freq10 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.rgb_pad_res256_freq10 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.rgb_pad_res256_freq10 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.effector_translation |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.effector_translation |
|
normalization_modes: |
|
state.effector_translation: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.action |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.action |
|
normalization_modes: |
|
action.action: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.rgb_pad_res256_freq10 |
|
state_concat_order: |
|
- state.effector_translation |
|
action_concat_order: |
|
- action.action |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
oxe_bridge: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closed |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closed |
|
normalization_modes: |
|
state.eef_position: min_max |
|
state.gripper_closed: min_max |
|
target_rotations: |
|
state.eef_rotation: rotation_6d |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.eef_position |
|
- action.eef_rotation |
|
- action.gripper_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.eef_position |
|
- action.eef_rotation |
|
- action.gripper_position |
|
normalization_modes: |
|
action.gripper_position: binary |
|
target_rotations: |
|
action.eef_rotation: axis_angle |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.image_0 |
|
- video.image_1 |
|
- video.image_2 |
|
state_concat_order: |
|
- state.eef_position |
|
- state.eef_rotation |
|
- state.gripper_closed |
|
action_concat_order: |
|
- action.eef_position |
|
- action.eef_rotation |
|
- action.gripper_position |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
agibot: |
|
_target_: gr00t.data.transform.ComposedModalityTransform |
|
transforms: |
|
- _target_: gr00t.data.transform.VideoToTensor |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
- _target_: gr00t.data.transform.VideoCrop |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
scale: 0.95 |
|
mode: random |
|
- _target_: gr00t.data.transform.VideoResize |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
height: 224 |
|
width: 224 |
|
interpolation: linear |
|
- _target_: gr00t.data.transform.VideoColorJitter |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
brightness: 0.3 |
|
contrast: 0.4 |
|
saturation: 0.5 |
|
hue: 0.08 |
|
- _target_: gr00t.data.transform.VideoToNumpy |
|
apply_to: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- state.left_arm_joint_position |
|
- state.right_arm_joint_position |
|
- state.left_effector_position |
|
- state.right_effector_position |
|
- state.head_position |
|
- state.waist_position |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- state.left_arm_joint_position |
|
- state.right_arm_joint_position |
|
- state.left_effector_position |
|
- state.right_effector_position |
|
- state.head_position |
|
- state.waist_position |
|
normalization_modes: |
|
state.left_arm_joint_position: min_max |
|
state.right_arm_joint_position: min_max |
|
state.left_effector_position: min_max |
|
state.right_effector_position: min_max |
|
state.head_position: min_max |
|
state.waist_position: min_max |
|
- _target_: gr00t.data.transform.StateActionToTensor |
|
apply_to: |
|
- action.left_arm_joint_position |
|
- action.right_arm_joint_position |
|
- action.left_effector_position |
|
- action.right_effector_position |
|
- action.head_position |
|
- action.waist_position |
|
- action.robot_velocity |
|
- _target_: gr00t.data.transform.StateActionTransform |
|
apply_to: |
|
- action.left_arm_joint_position |
|
- action.right_arm_joint_position |
|
- action.left_effector_position |
|
- action.right_effector_position |
|
- action.head_position |
|
- action.waist_position |
|
- action.robot_velocity |
|
normalization_modes: |
|
action.left_arm_joint_position: min_max |
|
action.right_arm_joint_position: min_max |
|
action.left_effector_position: min_max |
|
action.right_effector_position: min_max |
|
action.head_position: min_max |
|
action.waist_position: min_max |
|
action.robot_velocity: min_max |
|
- _target_: gr00t.data.transform.ConcatTransform |
|
video_concat_order: |
|
- video.top_head |
|
- video.hand_left |
|
- video.hand_right |
|
state_concat_order: |
|
- state.left_arm_joint_position |
|
- state.right_arm_joint_position |
|
- state.left_effector_position |
|
- state.right_effector_position |
|
- state.head_position |
|
- state.waist_position |
|
action_concat_order: |
|
- action.left_arm_joint_position |
|
- action.right_arm_joint_position |
|
- action.left_effector_position |
|
- action.right_effector_position |
|
- action.head_position |
|
- action.waist_position |
|
- action.robot_velocity |
|
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform |
|
default_instruction: Perform the default behavior. |
|
num_visual_tokens_per_frame: 16 |
|
max_num_images_per_sequence: 6 |
|
max_action_dim: 32 |
|
max_sequence_length: 112 |
|
action_horizon: 16 |
|
siglip_processor: |
|
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained |
|
_convert_: object |
|
pretrained_model_name_or_path: google/siglip2-large-patch16-256 |
|
embodiment_tag_mapping: |
|
real_gr1_arms_only: 0 |
|
real_gr1_arms_only_annotated: 1 |
|
real_gr1_arms_waist: 2 |
|
real_gr1_arms_waist_annotated: 3 |
|
dexmg_gr1_arms_only_inspire: 4 |
|
dexmg_gr1_arms_only_fourier: 5 |
|
dexmg_gr1_arms_waist_fourier: 6 |
|
robocasa_single_arm: 7 |
|
onex_eve_gripper: 8 |
|
robocasa_gr1_arms_only_inspire_hands: 9 |
|
robocasa_gr1_arms_only_fourier_hands: 10 |
|
robocasa_gr1_fixed_lower_body_inspire_hands: 11 |
|
robocasa_gr1_fixed_lower_body_fourier_hands: 12 |
|
robocasa_panda_omron: 13 |
|
robocasa_single_arm_panda_omron: 14 |
|
robocasa_bimanual_panda_parallel_gripper: 15 |
|
robocasa_bimanual_panda_inspire_hand: 16 |
|
oxe_droid: 17 |
|
oxe_fractal: 18 |
|
oxe_language_table: 19 |
|
oxe_bridge: 20 |
|
real_panda_single_arm: 21 |
|
unknown: 22 |
|
hot3d_hands_only: 23 |
|
gr1_unified: 24 |
|
robocasa_gr1_arms_waist_fourier_hands: 25 |
|
agibot: 26 |
|
lapa: 27 |
|
oxe_mutex: 28 |
|
oxe_roboset: 29 |
|
oxe_plex: 30 |
|
dream: 31 |
|
metadata_versions: |
|
robocasa_gr1_arms_only_fourier_hands: '0217' |
|
robocasa_gr1_fixed_lower_body_fourier_hands: '0217' |
|
robocasa_bimanual_panda_parallel_gripper: '0217' |
|
robocasa_bimanual_panda_inspire_hand: '0217' |
|
robocasa_panda_omron: '0217' |
|
gr1_unified: '0225' |
|
oxe_droid: '0221' |
|
oxe_fractal: '0221' |
|
oxe_language_table: '0221' |
|
oxe_bridge: '0221' |
|
robocasa_gr1_arms_waist_fourier_hands: '0225' |
|
agibot: '0225' |
|
dataset_path: ??? |
|
max_state_dim: 44 |
|
mixture_dataset_cls: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec |
|
single_dataset_cls: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotSingleDataset |
|
data_root: /mnt/amlfs-02/shared/datasets |
|
gr00t_commit_hash: 16d97a65f0541e14efa958455542c5ae3ad9607f |
|
total_training_steps: 163840000000 |
|
|