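# NOTE: the three lines below are page-header residue captured together with
# this file (not configuration); kept as comments so the YAML stays parseable.
# seonghyeonye's picture
# Upload folder using huggingface_hub
# 3fa7e46 verified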
# Instantiation spec for the IDM model.
#
# The nesting below was reconstructed from a copy of this file whose
# indentation had been flattened. Keys that repeat (`_target_`, `_convert_`,
# `config`, `_recursive_`, `model_dtype`, `hidden_size`, `action_dim`,
# `action_horizon`, `num_layers`, `dropout`, ...) are only valid when each
# occurrence lives in its own nested mapping, so every `_target_` opens one.
# NOTE(review): `_target_` / `_convert_` / `_recursive_` look like Hydra
# `instantiate` directives; confirm against the loader that reads this file.
model:
  _target_: gr00t.model.idm.IDM
  _convert_: object
  config:
    _target_: gr00t.model.idm.IDMConfig
    # NOTE(review): presumably keeps the nested `*_cfg` mappings as plain
    # config so the model builds them itself; confirm.
    _recursive_: false
    model_dtype: float32
    # NOTE(review): 0 is paired with IdentityBackbone below; confirm this is
    # the intended "no backbone features" setting.
    hidden_size: 0
    action_horizon: 16
    action_dim: 32
    backbone_cfg:
      _target_: gr00t.model.backbone.IdentityBackbone
    action_head_cfg:
      _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM
      _convert_: object
      config:
        _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig
        _recursive_: false
        # Key spelling ("seperator") is kept exactly as found; it is a runtime
        # key and presumably must match the config class field name.
        add_seperator_token: true
        add_pos_embed: true
        model_dtype: float32
        mm_vision_select_layer: -2
        max_state_dim: 44
        # Same value as the top-level `action_dim` (32).
        max_action_dim: 32
        hidden_size: 1024
        tune_vision_tower: true
        add_view_embed: true
        max_num_views: 6
        siglip_model_cfg:
          _target_: gr00t.model.action_head.siglip.SiglipModel.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # NOTE(review): placed as a sibling of `siglip_model_cfg` (an action
        # head setting) rather than as a `from_pretrained` argument; the
        # flattened source cannot disambiguate, so confirm.
        siglip_hidden_size: 1024
        vl_self_attention_cfg:
          _target_: gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer
          positional_embeddings: null
          num_layers: 4
          num_attention_heads: 16
          attention_head_dim: 64
          dropout: 0.2
          final_dropout: true
        diffusion_model_cfg:
          _target_: gr00t.model.action_head.cross_attention_dit.DiT
          positional_embeddings: null
          num_layers: 8
          num_attention_heads: 16
          attention_head_dim: 64
          norm_type: ada_norm
          dropout: 0.2
          final_dropout: true
          output_dim: 1024
          interleave_self_attention: true
        mm_projector_cfg:
          _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjector
          _convert_: object
          config:
            _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig
            hidden_size: 1024
            mm_hidden_size: 1024
            mm_projector_type: mlp_doubledownsample
        # These two repeat the top-level values (32 and 16); as repeated keys
        # they cannot sit in the top-level `config` mapping.
        action_dim: 32
        action_horizon: 16
        num_inference_timesteps: 16
        noise_beta_alpha: 1.5
        noise_beta_beta: 1.0
        noise_s: 0.999
        num_timestep_buckets: 1000
        # NOTE(review): kept at the same level as the preceding keys; it could
        # instead belong to the outer IDMConfig mapping. Confirm against
        # the config classes before relying on this placement.
        backbone_features_projector_cfg: null
train_dataset:
_target_: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec
_convert_: object
mixture_spec:
- dataset_path:
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CloseDoubleDoor256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CloseDrawer256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CloseSingleDoor256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CoffeePressButton256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CoffeeServeMug256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CoffeeSetupMug256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.OpenDoubleDoor256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.OpenDrawer256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.OpenSingleDoor256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCabToCounter256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToCab256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToMicrowave256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToSink256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToStove256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPMicrowaveToCounter256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPSinkToCounter256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPStoveToCounter256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOffMicrowave256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOffSinkFaucet256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOffStove256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOnMicrowave256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOnSinkFaucet256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOnStove256_300
- /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnSinkSpout256_300
dataset_weight: 1.0
dataset_class: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotSingleDataset
all_modality_configs:
robocasa_gr1_arms_only_fourier_hands:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.ego_view_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
robocasa_gr1_arms_waist_fourier_hands:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.ego_view_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
robocasa_gr1_fixed_lower_body_fourier_hands:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.agentview_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
robocasa_bimanual_panda_parallel_gripper:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
robocasa_bimanual_panda_inspire_hand:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
robocasa_panda_omron:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.left_view
- video.right_view
- video.wrist_view
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
gr1_unified:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.ego_view_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.coarse_action
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
oxe_droid:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.eef_position
- state.eef_rotation
- state.gripper_position
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.language_instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
oxe_fractal:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.image_pad_res256_freq03
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.world_vector
- action.rotation_delta
- action.gripper_position
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.natural_language_instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
oxe_language_table:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.rgb_pad_res256_freq10
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.effector_translation
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.action
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
oxe_bridge:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.image_0
- video.image_1
- video.image_2
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.eef_position
- action.eef_rotation
- action.gripper_position
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.language_instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
agibot:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.top_head
- video.hand_left
- video.hand_right
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm_joint_position
- state.right_arm_joint_position
- state.left_effector_position
- state.right_effector_position
- state.head_position
- state.waist_position
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm_joint_position
- action.right_arm_joint_position
- action.left_effector_position
- action.right_effector_position
- action.head_position
- action.waist_position
- action.robot_velocity
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.agibot.task_description
all_transforms:
robocasa_gr1_arms_only_fourier_hands:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.ego_view_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.ego_view_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.ego_view_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
normalization_modes:
state.left_arm: min_max
state.right_arm: min_max
state.left_hand: min_max
state.right_hand: min_max
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
normalization_modes:
action.right_arm: min_max
action.left_arm: min_max
action.right_hand: min_max
action.left_hand: min_max
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.ego_view_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
robocasa_gr1_arms_waist_fourier_hands:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.ego_view_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.ego_view_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.ego_view_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
normalization_modes:
state.left_arm: min_max
state.right_arm: min_max
state.left_hand: min_max
state.right_hand: min_max
state.waist: min_max
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
normalization_modes:
action.right_arm: min_max
action.left_arm: min_max
action.right_hand: min_max
action.left_hand: min_max
action.waist: min_max
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.ego_view_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
robocasa_gr1_fixed_lower_body_fourier_hands:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.agentview_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.agentview_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.agentview_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.agentview_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.agentview_pad_res256_freq20
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
normalization_modes:
state.left_arm: min_max
state.right_arm: min_max
state.left_hand: min_max
state.right_hand: min_max
state.waist: min_max
state.neck: min_max
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
normalization_modes:
action.right_arm: min_max
action.left_arm: min_max
action.right_hand: min_max
action.left_hand: min_max
action.waist: min_max
action.neck: min_max
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.agentview_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
robocasa_bimanual_panda_parallel_gripper:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
normalization_modes:
state.right_arm_eef_pos: min_max
state.right_gripper_qpos: min_max
state.left_arm_eef_pos: min_max
state.left_gripper_qpos: min_max
target_rotations:
state.right_arm_eef_quat: rotation_6d
state.left_arm_eef_quat: rotation_6d
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
normalization_modes:
action.right_gripper_close: binary
action.left_gripper_close: binary
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state_concat_order:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
action_concat_order:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
robocasa_bimanual_panda_inspire_hand:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
normalization_modes:
state.right_arm_eef_pos: min_max
state.right_hand: min_max
state.left_arm_eef_pos: min_max
state.left_hand: min_max
target_rotations:
state.right_arm_eef_quat: rotation_6d
state.left_arm_eef_quat: rotation_6d
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
normalization_modes:
action.right_hand: min_max
action.left_hand: min_max
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state_concat_order:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
action_concat_order:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
robocasa_panda_omron:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
normalization_modes:
state.end_effector_position_relative: min_max
state.end_effector_rotation_relative: min_max
state.gripper_qpos: min_max
state.base_position: min_max
state.base_rotation: min_max
target_rotations:
state.end_effector_rotation_relative: rotation_6d
state.base_rotation: rotation_6d
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
normalization_modes:
action.end_effector_position: min_max
action.end_effector_rotation: min_max
action.gripper_close: binary
action.base_motion: min_max
action.control_mode: binary
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.left_view
- video.right_view
- video.wrist_view
state_concat_order:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
action_concat_order:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
gr1_unified:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.ego_view_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.ego_view_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.ego_view_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
normalization_modes:
state.left_arm: scale
state.right_arm: scale
state.left_hand: scale
state.right_hand: scale
state.waist: scale
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
normalization_modes:
action.left_arm: scale
action.right_arm: scale
action.left_hand: scale
action.right_hand: scale
action.waist: scale
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.ego_view_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
oxe_droid:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_position
normalization_modes:
state.eef_position: min_max
state.gripper_position: min_max
target_rotations:
state.eef_rotation: rotation_6d
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
normalization_modes:
action.gripper_position: binary
target_rotations:
action.eef_rotation_delta: axis_angle
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
state_concat_order:
- state.eef_position
- state.eef_rotation
- state.gripper_position
action_concat_order:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
oxe_fractal:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.image_pad_res256_freq03
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.image_pad_res256_freq03
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.image_pad_res256_freq03
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.image_pad_res256_freq03
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.image_pad_res256_freq03
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
normalization_modes:
state.eef_position: min_max
state.gripper_closedness_commanded: min_max
target_rotations:
state.eef_rotation: rotation_6d
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.world_vector
- action.rotation_delta
- action.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.world_vector
- action.rotation_delta
- action.gripper_position
normalization_modes:
action.gripper_position: binary
target_rotations:
action.rotation_delta: axis_angle
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.image_pad_res256_freq03
state_concat_order:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
action_concat_order:
- action.world_vector
- action.rotation_delta
- action.gripper_position
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
oxe_language_table:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.rgb_pad_res256_freq10
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.rgb_pad_res256_freq10
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.rgb_pad_res256_freq10
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.rgb_pad_res256_freq10
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.rgb_pad_res256_freq10
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.effector_translation
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.effector_translation
normalization_modes:
state.effector_translation: min_max
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.action
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.action
normalization_modes:
action.action: min_max
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.rgb_pad_res256_freq10
state_concat_order:
- state.effector_translation
action_concat_order:
- action.action
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
oxe_bridge:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.image_0
- video.image_1
- video.image_2
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.image_0
- video.image_1
- video.image_2
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.image_0
- video.image_1
- video.image_2
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.image_0
- video.image_1
- video.image_2
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.image_0
- video.image_1
- video.image_2
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
normalization_modes:
state.eef_position: min_max
state.gripper_closed: min_max
target_rotations:
state.eef_rotation: rotation_6d
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.eef_position
- action.eef_rotation
- action.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.eef_position
- action.eef_rotation
- action.gripper_position
normalization_modes:
action.gripper_position: binary
target_rotations:
action.eef_rotation: axis_angle
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.image_0
- video.image_1
- video.image_2
state_concat_order:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
action_concat_order:
- action.eef_position
- action.eef_rotation
- action.gripper_position
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
agibot:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.top_head
- video.hand_left
- video.hand_right
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.top_head
- video.hand_left
- video.hand_right
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.top_head
- video.hand_left
- video.hand_right
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.top_head
- video.hand_left
- video.hand_right
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.top_head
- video.hand_left
- video.hand_right
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm_joint_position
- state.right_arm_joint_position
- state.left_effector_position
- state.right_effector_position
- state.head_position
- state.waist_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm_joint_position
- state.right_arm_joint_position
- state.left_effector_position
- state.right_effector_position
- state.head_position
- state.waist_position
normalization_modes:
state.left_arm_joint_position: min_max
state.right_arm_joint_position: min_max
state.left_effector_position: min_max
state.right_effector_position: min_max
state.head_position: min_max
state.waist_position: min_max
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm_joint_position
- action.right_arm_joint_position
- action.left_effector_position
- action.right_effector_position
- action.head_position
- action.waist_position
- action.robot_velocity
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm_joint_position
- action.right_arm_joint_position
- action.left_effector_position
- action.right_effector_position
- action.head_position
- action.waist_position
- action.robot_velocity
normalization_modes:
action.left_arm_joint_position: min_max
action.right_arm_joint_position: min_max
action.left_effector_position: min_max
action.right_effector_position: min_max
action.head_position: min_max
action.waist_position: min_max
action.robot_velocity: min_max
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.top_head
- video.hand_left
- video.hand_right
state_concat_order:
- state.left_arm_joint_position
- state.right_arm_joint_position
- state.left_effector_position
- state.right_effector_position
- state.head_position
- state.waist_position
action_concat_order:
- action.left_arm_joint_position
- action.right_arm_joint_position
- action.left_effector_position
- action.right_effector_position
- action.head_position
- action.waist_position
- action.robot_velocity
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
metadata_versions:
robocasa_gr1_arms_only_fourier_hands: '0217'
robocasa_gr1_fixed_lower_body_fourier_hands: '0217'
robocasa_bimanual_panda_parallel_gripper: '0217'
robocasa_bimanual_panda_inspire_hand: '0217'
robocasa_panda_omron: '0217'
gr1_unified: '0225'
oxe_droid: '0221'
oxe_fractal: '0221'
oxe_language_table: '0221'
oxe_bridge: '0221'
robocasa_gr1_arms_waist_fourier_hands: '0225'
agibot: '0225'
dataset_kwargs:
video_backend: decord
mixture_kwargs:
training: true
balance_dataset_weights: true
seed: 42
trainer:
_target_: gr00t.experiment.dual_brain.experiment.DualBrainTrainer
_partial_: true
_recursive_: false
callbacks: null
model: ???
train_dataset: ???
compute_dtype: ???
benchmark_time: false
enable_profiling: false
profiling_steps: 5
wandb_project: dream_idm
output_dir: /mnt/amlfs-01/home/seonghyeony/checkpoints/gr00t_s_idm_24P_300
load_from_yaml: null
gear_credentials: /mnt/amlfs-01/home/seonghyeony/.gear/data_credentials
upload_checkpoints: false
upload_every: 10000
upload_last_n_checkpoints: 5
remove_unused_columns: false
bf16: true
tf32: true
global_batch_size: 1024
raise_error_if_global_batch_size_not_set: true
per_device_train_batch_size: 32
per_device_eval_batch_size: 64
gradient_accumulation_steps: 1
dataloader_num_workers: 6
dataloader_pin_memory: false
dataloader_persistent_workers: true
optim: adamw_torch
learning_rate: 0.0001
adam_beta1: 0.95
adam_beta2: 0.999
adam_epsilon: 1.0e-08
weight_decay: 1.0e-05
lr_scheduler_type: cosine
warmup_ratio: 0.05
logging_steps: 10.0
num_train_epochs: 1000
max_steps: 60000
save_strategy: steps
save_steps: 1000
eval_strategy: 'no'
save_total_limit: 20
report_to: wandb
seed: 42
do_eval: false
gradient_checkpointing: false
ddp_find_unused_parameters: false
ddp_bucket_cap_mb: 100
ray_num_workers: 32
eval_bf16: true
torch_compile_mode: null
pretrained_model_path: null
only_tune_projectors: false
training_args:
_target_: transformers.TrainingArguments
output_dir: /mnt/amlfs-01/home/seonghyeony/checkpoints/gr00t_s_idm_24P_300
run_name: gr00t_s_idm_24P_300
remove_unused_columns: false
deepspeed: gr00t/gr00t/experiment/dual_brain/configs/deepspeed/zero2.json
gradient_checkpointing: false
bf16: true
tf32: true
per_device_train_batch_size: 32
per_device_eval_batch_size: 64
gradient_accumulation_steps: 1
dataloader_num_workers: 6
dataloader_pin_memory: false
dataloader_persistent_workers: true
optim: adamw_torch
adam_beta1: 0.95
adam_beta2: 0.999
adam_epsilon: 1.0e-08
learning_rate: 0.0001
weight_decay: 1.0e-05
warmup_ratio: 0.05
lr_scheduler_type: cosine
logging_steps: 10.0
num_train_epochs: 1000
max_steps: 60000
save_strategy: steps
save_steps: 1000
save_total_limit: 20
report_to: wandb
seed: 42
do_eval: false
ddp_find_unused_parameters: false
ddp_bucket_cap_mb: 100
torch_compile_mode: null
add_seperator_token: true
add_pos_embed: true
hidden_size: 1024
attn_dropout: 0.2
siglip_hidden_size: 1024
siglip_version: google/siglip2-large-patch16-256
action_head_cfg:
_target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM
_convert_: object
config:
_target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig
_recursive_: false
add_seperator_token: true
add_pos_embed: true
model_dtype: float32
mm_vision_select_layer: -2
max_state_dim: 44
max_action_dim: 32
hidden_size: 1024
tune_vision_tower: true
add_view_embed: true
max_num_views: 6
siglip_model_cfg:
_target_: gr00t.model.action_head.siglip.SiglipModel.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
siglip_hidden_size: 1024
vl_self_attention_cfg:
_target_: gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer
positional_embeddings: null
num_layers: 4
num_attention_heads: 16
attention_head_dim: 64
dropout: 0.2
final_dropout: true
diffusion_model_cfg:
_target_: gr00t.model.action_head.cross_attention_dit.DiT
positional_embeddings: null
num_layers: 8
num_attention_heads: 16
attention_head_dim: 64
norm_type: ada_norm
dropout: 0.2
final_dropout: true
output_dim: 1024
interleave_self_attention: true
mm_projector_cfg:
_target_: gr00t.model.action_head.multimodal_projector.MultimodalProjector
_convert_: object
config:
_target_: gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig
hidden_size: 1024
mm_hidden_size: 1024
mm_projector_type: mlp_doubledownsample
action_dim: 32
action_horizon: 16
num_inference_timesteps: 16
noise_beta_alpha: 1.5
noise_beta_beta: 1.0
noise_s: 0.999
num_timestep_buckets: 1000
backbone_features_projector_cfg: null
backbone_hidden_size: 0
backbone_cfg:
_target_: gr00t.model.backbone.IdentityBackbone
embodiment_tag_to_projector_index:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
num_visual_tokens_per_frame: 16
max_action_dim: 32
language_dropout_prob: 0.0
model_image_resolution: 224
max_sequence_length: 112
model_specific_transform:
_target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
data_collator:
_target_: gr00t.model.transforms_idm.DefaultDataCollatorGR00TIDM
action_horizon: 16
totensor_cfg:
_target_: gr00t.data.transform.VideoToTensor
apply_to: ???
crop_cfg:
_target_: gr00t.data.transform.VideoCrop
apply_to: ???
scale: 0.95
mode: random
resize_cfg:
_target_: gr00t.data.transform.VideoResize
apply_to: ???
height: 224
width: 224
interpolation: linear
color_jitter_cfg:
_target_: gr00t.data.transform.VideoColorJitter
apply_to: ???
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
to_numpy_cfg:
_target_: gr00t.data.transform.VideoToNumpy
apply_to: ???
modality_config_robocasa_gr1_arms_only_fourier_hands:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.ego_view_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
# Transform pipeline for the `robocasa_gr1_arms_only_fourier_hands` tag: a
# ComposedModalityTransform whose `transforms` list holds the video steps, the
# state/action tensor conversion and normalization, the concatenation step and
# finally GR00TIDMTransform.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
transform_robocasa_gr1_arms_only_fourier_hands:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# --- video steps (single ego-view key) ---
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.ego_view_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.ego_view_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.ego_view_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.ego_view_pad_res256_freq20
# --- state: tensor conversion, then min_max normalization for every key ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
normalization_modes:
state.left_arm: min_max
state.right_arm: min_max
state.left_hand: min_max
state.right_hand: min_max
# --- action: tensor conversion, then min_max normalization for every key ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
normalization_modes:
action.right_arm: min_max
action.left_arm: min_max
action.right_hand: min_max
action.left_hand: min_max
# --- concatenation order per modality (same key order as the lists above) ---
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.ego_view_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
# --- final GR00TIDMTransform entry ---
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
# NOTE(review): 112 = 6 images x 16 visual tokens + 16; presumably derived from the values above - confirm.
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Tag name -> integer id table (32 entries, ids 0-31).
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Modality selection for the `robocasa_gr1_arms_waist_fourier_hands` tag: one
# ModalityConfig each for video, state, action, language and lapa_action.
# State and action each list a `waist` key in addition to the arm and hand keys.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
modality_config_robocasa_gr1_arms_waist_fourier_hands:
video:
_target_: gr00t.data.dataset.ModalityConfig
# Two video offsets: 0 and 16 (presumably steps relative to the sampled index - TODO confirm).
delta_indices:
- 0
- 16
modality_keys:
- video.ego_view_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
# Single offset 0 for state.
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action:
_target_: gr00t.data.dataset.ModalityConfig
# 16 consecutive offsets, 0 through 15.
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
# Transform pipeline for the `robocasa_gr1_arms_waist_fourier_hands` tag: a
# ComposedModalityTransform whose `transforms` list holds the video steps, the
# state/action tensor conversion and normalization, the concatenation step and
# finally GR00TIDMTransform.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
transform_robocasa_gr1_arms_waist_fourier_hands:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# --- video steps (single ego-view key) ---
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.ego_view_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.ego_view_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.ego_view_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.ego_view_pad_res256_freq20
# --- state: tensor conversion, then min_max normalization for every key ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
normalization_modes:
state.left_arm: min_max
state.right_arm: min_max
state.left_hand: min_max
state.right_hand: min_max
state.waist: min_max
# --- action: tensor conversion, then min_max normalization for every key ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
normalization_modes:
action.right_arm: min_max
action.left_arm: min_max
action.right_hand: min_max
action.left_hand: min_max
action.waist: min_max
# --- concatenation order per modality (same key order as the lists above) ---
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.ego_view_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
# --- final GR00TIDMTransform entry ---
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
# NOTE(review): 112 = 6 images x 16 visual tokens + 16; presumably derived from the values above - confirm.
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Tag name -> integer id table (32 entries, ids 0-31).
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Modality selection for the `robocasa_panda_omron` tag: one ModalityConfig each
# for video (three views), state, action, language and lapa_action.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
modality_config_robocasa_panda_omron:
video:
_target_: gr00t.data.dataset.ModalityConfig
# Two video offsets: 0 and 16 (presumably steps relative to the sampled index - TODO confirm).
delta_indices:
- 0
- 16
modality_keys:
- video.left_view
- video.right_view
- video.wrist_view
state:
_target_: gr00t.data.dataset.ModalityConfig
# Single offset 0 for state.
delta_indices:
- 0
modality_keys:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
action:
_target_: gr00t.data.dataset.ModalityConfig
# 16 consecutive offsets, 0 through 15.
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
# Transform pipeline for the `robocasa_panda_omron` tag: a
# ComposedModalityTransform whose `transforms` list holds the video steps (three
# views), the state/action tensor conversion and normalization, the
# concatenation step and finally GR00TIDMTransform.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
transform_robocasa_panda_omron:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# --- video steps (left, right and wrist views) ---
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
# --- state: tensor conversion, then normalization ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
normalization_modes:
state.end_effector_position_relative: min_max
state.end_effector_rotation_relative: min_max
state.gripper_qpos: min_max
state.base_position: min_max
state.base_rotation: min_max
# The two rotation keys carry both a min_max mode above and a rotation_6d target here.
# NOTE(review): how the two settings combine is decided by StateActionTransform - confirm.
target_rotations:
state.end_effector_rotation_relative: rotation_6d
state.base_rotation: rotation_6d
# --- action: tensor conversion, then normalization ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
# gripper_close and control_mode use `binary`; the remaining action keys use min_max.
normalization_modes:
action.end_effector_position: min_max
action.end_effector_rotation: min_max
action.gripper_close: binary
action.base_motion: min_max
action.control_mode: binary
# --- concatenation order per modality (same key order as the lists above) ---
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.left_view
- video.right_view
- video.wrist_view
state_concat_order:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
action_concat_order:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
# --- final GR00TIDMTransform entry ---
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
# NOTE(review): 112 = 6 images x 16 visual tokens + 16; presumably derived from the values above - confirm.
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Tag name -> integer id table (32 entries, ids 0-31).
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Modality selection for the `robocasa_gr1_fixed_lower_body_fourier_hands` tag:
# one ModalityConfig each for video, state, action, language and lapa_action.
# State and action each list `waist` and `neck` keys besides the arm and hand
# keys; the video key here is the agentview stream.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
modality_config_robocasa_gr1_fixed_lower_body_fourier_hands:
video:
_target_: gr00t.data.dataset.ModalityConfig
# Two video offsets: 0 and 16 (presumably steps relative to the sampled index - TODO confirm).
delta_indices:
- 0
- 16
modality_keys:
- video.agentview_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
# Single offset 0 for state.
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
action:
_target_: gr00t.data.dataset.ModalityConfig
# 16 consecutive offsets, 0 through 15.
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
# Transform pipeline for the `robocasa_gr1_fixed_lower_body_fourier_hands` tag:
# a ComposedModalityTransform whose `transforms` list holds the video steps, the
# state/action tensor conversion and normalization, the concatenation step and
# finally GR00TIDMTransform.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
transform_robocasa_gr1_fixed_lower_body_fourier_hands:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# --- video steps (single agentview key) ---
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.agentview_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.agentview_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.agentview_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.agentview_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.agentview_pad_res256_freq20
# --- state: tensor conversion, then min_max normalization for every key ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
normalization_modes:
state.left_arm: min_max
state.right_arm: min_max
state.left_hand: min_max
state.right_hand: min_max
state.waist: min_max
state.neck: min_max
# --- action: tensor conversion, then min_max normalization for every key ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
normalization_modes:
action.right_arm: min_max
action.left_arm: min_max
action.right_hand: min_max
action.left_hand: min_max
action.waist: min_max
action.neck: min_max
# --- concatenation order per modality (same key order as the lists above) ---
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.agentview_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
# --- final GR00TIDMTransform entry ---
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
# NOTE(review): 112 = 6 images x 16 visual tokens + 16; presumably derived from the values above - confirm.
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Tag name -> integer id table (32 entries, ids 0-31).
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Modality selection for the `robocasa_bimanual_panda_parallel_gripper` tag: one
# ModalityConfig each for video (three views), state, action, language and
# lapa_action. State keys end in `_eef_quat` where the action keys end in
# `_eef_rot`, and in `_gripper_qpos` where the action keys end in `_gripper_close`.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
modality_config_robocasa_bimanual_panda_parallel_gripper:
video:
_target_: gr00t.data.dataset.ModalityConfig
# Two video offsets: 0 and 16 (presumably steps relative to the sampled index - TODO confirm).
delta_indices:
- 0
- 16
modality_keys:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
# Single offset 0 for state.
delta_indices:
- 0
modality_keys:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
action:
_target_: gr00t.data.dataset.ModalityConfig
# 16 consecutive offsets, 0 through 15.
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
# Transform pipeline for the `robocasa_bimanual_panda_parallel_gripper` tag: a
# ComposedModalityTransform whose `transforms` list holds the video steps (three
# views), the state/action tensor conversion and normalization, the
# concatenation step and finally GR00TIDMTransform.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
transform_robocasa_bimanual_panda_parallel_gripper:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# --- video steps (two eye-in-hand views plus agentview) ---
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
# --- state: tensor conversion, then normalization ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
# Position and gripper keys get min_max; the two quat keys have no mode here
# and are listed under target_rotations (rotation_6d) instead.
normalization_modes:
state.right_arm_eef_pos: min_max
state.right_gripper_qpos: min_max
state.left_arm_eef_pos: min_max
state.left_gripper_qpos: min_max
target_rotations:
state.right_arm_eef_quat: rotation_6d
state.left_arm_eef_quat: rotation_6d
# --- action: tensor conversion, then normalization ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
# Only the two gripper_close keys have a mode (binary).
# NOTE(review): the eef_pos / eef_rot action keys have no normalization mode and
# no target_rotations entry - confirm this is intentional.
normalization_modes:
action.right_gripper_close: binary
action.left_gripper_close: binary
# --- concatenation order per modality (same key order as the lists above) ---
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state_concat_order:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
action_concat_order:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
# --- final GR00TIDMTransform entry ---
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
# NOTE(review): 112 = 6 images x 16 visual tokens + 16; presumably derived from the values above - confirm.
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Tag name -> integer id table (32 entries, ids 0-31).
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Modality selection for the `robocasa_bimanual_panda_inspire_hand` tag: one
# ModalityConfig each for video (three views), state, action, language and
# lapa_action. State keys end in `_eef_quat` where the action keys end in
# `_eef_rot`; both modalities list `right_hand` / `left_hand` keys.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
modality_config_robocasa_bimanual_panda_inspire_hand:
video:
_target_: gr00t.data.dataset.ModalityConfig
# Two video offsets: 0 and 16 (presumably steps relative to the sampled index - TODO confirm).
delta_indices:
- 0
- 16
modality_keys:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
# Single offset 0 for state.
delta_indices:
- 0
modality_keys:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
action:
_target_: gr00t.data.dataset.ModalityConfig
# 16 consecutive offsets, 0 through 15.
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
# Transform pipeline for the `robocasa_bimanual_panda_inspire_hand` tag: a
# ComposedModalityTransform whose `transforms` list holds the video steps (three
# views), the state/action tensor conversion and normalization, the
# concatenation step and finally GR00TIDMTransform.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
transform_robocasa_bimanual_panda_inspire_hand:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# --- video steps (two eye-in-hand views plus agentview) ---
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
# --- state: tensor conversion, then normalization ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
# Position and hand keys get min_max; the two quat keys have no mode here
# and are listed under target_rotations (rotation_6d) instead.
normalization_modes:
state.right_arm_eef_pos: min_max
state.right_hand: min_max
state.left_arm_eef_pos: min_max
state.left_hand: min_max
target_rotations:
state.right_arm_eef_quat: rotation_6d
state.left_arm_eef_quat: rotation_6d
# --- action: tensor conversion, then normalization ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
# Only the two hand keys have a mode (min_max).
# NOTE(review): the eef_pos / eef_rot action keys have no normalization mode and
# no target_rotations entry - confirm this is intentional.
normalization_modes:
action.right_hand: min_max
action.left_hand: min_max
# --- concatenation order per modality (same key order as the lists above) ---
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state_concat_order:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
action_concat_order:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
# --- final GR00TIDMTransform entry ---
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
# NOTE(review): 112 = 6 images x 16 visual tokens + 16; presumably derived from the values above - confirm.
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Tag name -> integer id table (32 entries, ids 0-31).
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Modality selection for the `gr1_unified` tag: one ModalityConfig each for
# video, state, action, language and lapa_action.
# The language key here is `annotation.human.coarse_action`, whereas the
# robocasa_* blocks in this file use `annotation.human.action.task_description`.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
modality_config_gr1_unified:
video:
_target_: gr00t.data.dataset.ModalityConfig
# Two video offsets: 0 and 16 (presumably steps relative to the sampled index - TODO confirm).
delta_indices:
- 0
- 16
modality_keys:
- video.ego_view_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
# Single offset 0 for state.
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action:
_target_: gr00t.data.dataset.ModalityConfig
# 16 consecutive offsets, 0 through 15.
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.coarse_action
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
# Transform pipeline for the `gr1_unified` tag: a ComposedModalityTransform
# whose `transforms` list holds the video steps, the state/action tensor
# conversion and normalization, the concatenation step and finally
# GR00TIDMTransform.
# Every state and action key uses the `scale` normalization mode, whereas the
# robocasa_gr1_* blocks in this file use `min_max`.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
transform_gr1_unified:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# --- video steps (single ego-view key) ---
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.ego_view_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.ego_view_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.ego_view_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.ego_view_pad_res256_freq20
# --- state: tensor conversion, then `scale` normalization for every key ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
normalization_modes:
state.left_arm: scale
state.right_arm: scale
state.left_hand: scale
state.right_hand: scale
state.waist: scale
# --- action: tensor conversion, then `scale` normalization for every key ---
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
normalization_modes:
action.left_arm: scale
action.right_arm: scale
action.left_hand: scale
action.right_hand: scale
action.waist: scale
# --- concatenation order per modality (same key order as the lists above) ---
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.ego_view_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
# --- final GR00TIDMTransform entry ---
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
# NOTE(review): 112 = 6 images x 16 visual tokens + 16; presumably derived from the values above - confirm.
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Tag name -> integer id table (32 entries, ids 0-31).
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Modality selection for the `oxe_droid` tag: one ModalityConfig each for video
# (three views), state, action, language and lapa_action.
# Video keys here carry a `freq15` suffix and the language key is
# `annotation.language.language_instruction`.
# NOTE(review): indentation is flattened in this dump; nesting is inferred from key order.
modality_config_oxe_droid:
video:
_target_: gr00t.data.dataset.ModalityConfig
# Two video offsets: 0 and 16 (presumably steps relative to the sampled index - TODO confirm).
delta_indices:
- 0
- 16
modality_keys:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
state:
_target_: gr00t.data.dataset.ModalityConfig
# Single offset 0 for state.
delta_indices:
- 0
modality_keys:
- state.eef_position
- state.eef_rotation
- state.gripper_position
action:
_target_: gr00t.data.dataset.ModalityConfig
# 16 consecutive offsets, 0 through 15.
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.language_instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
transform_oxe_droid:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_position
normalization_modes:
state.eef_position: min_max
state.gripper_position: min_max
target_rotations:
state.eef_rotation: rotation_6d
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
normalization_modes:
action.gripper_position: binary
target_rotations:
action.eef_rotation_delta: axis_angle
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
state_concat_order:
- state.eef_position
- state.eef_rotation
- state.gripper_position
action_concat_order:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
modality_config_oxe_fractal:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.image_pad_res256_freq03
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.world_vector
- action.rotation_delta
- action.gripper_position
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.natural_language_instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
transform_oxe_fractal:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.image_pad_res256_freq03
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.image_pad_res256_freq03
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.image_pad_res256_freq03
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.image_pad_res256_freq03
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.image_pad_res256_freq03
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
normalization_modes:
state.eef_position: min_max
state.gripper_closedness_commanded: min_max
target_rotations:
state.eef_rotation: rotation_6d
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.world_vector
- action.rotation_delta
- action.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.world_vector
- action.rotation_delta
- action.gripper_position
normalization_modes:
action.gripper_position: binary
target_rotations:
action.rotation_delta: axis_angle
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.image_pad_res256_freq03
state_concat_order:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
action_concat_order:
- action.world_vector
- action.rotation_delta
- action.gripper_position
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
modality_config_oxe_language_table:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.rgb_pad_res256_freq10
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.effector_translation
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.action
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
transform_oxe_language_table:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.rgb_pad_res256_freq10
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.rgb_pad_res256_freq10
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.rgb_pad_res256_freq10
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.rgb_pad_res256_freq10
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.rgb_pad_res256_freq10
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.effector_translation
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.effector_translation
normalization_modes:
state.effector_translation: min_max
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.action
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.action
normalization_modes:
action.action: min_max
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.rgb_pad_res256_freq10
state_concat_order:
- state.effector_translation
action_concat_order:
- action.action
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
modality_config_oxe_bridge:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.image_0
- video.image_1
- video.image_2
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.eef_position
- action.eef_rotation
- action.gripper_position
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.language_instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
transform_oxe_bridge:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.image_0
- video.image_1
- video.image_2
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.image_0
- video.image_1
- video.image_2
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.image_0
- video.image_1
- video.image_2
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.image_0
- video.image_1
- video.image_2
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.image_0
- video.image_1
- video.image_2
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
normalization_modes:
state.eef_position: min_max
state.gripper_closed: min_max
target_rotations:
state.eef_rotation: rotation_6d
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.eef_position
- action.eef_rotation
- action.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.eef_position
- action.eef_rotation
- action.gripper_position
normalization_modes:
action.gripper_position: binary
target_rotations:
action.eef_rotation: axis_angle
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.image_0
- video.image_1
- video.image_2
state_concat_order:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
action_concat_order:
- action.eef_position
- action.eef_rotation
- action.gripper_position
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
modality_config_agibot:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.top_head
- video.hand_left
- video.hand_right
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm_joint_position
- state.right_arm_joint_position
- state.left_effector_position
- state.right_effector_position
- state.head_position
- state.waist_position
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm_joint_position
- action.right_arm_joint_position
- action.left_effector_position
- action.right_effector_position
- action.head_position
- action.waist_position
- action.robot_velocity
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.agibot.task_description
transform_agibot:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.top_head
- video.hand_left
- video.hand_right
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.top_head
- video.hand_left
- video.hand_right
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.top_head
- video.hand_left
- video.hand_right
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.top_head
- video.hand_left
- video.hand_right
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.top_head
- video.hand_left
- video.hand_right
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm_joint_position
- state.right_arm_joint_position
- state.left_effector_position
- state.right_effector_position
- state.head_position
- state.waist_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm_joint_position
- state.right_arm_joint_position
- state.left_effector_position
- state.right_effector_position
- state.head_position
- state.waist_position
normalization_modes:
state.left_arm_joint_position: min_max
state.right_arm_joint_position: min_max
state.left_effector_position: min_max
state.right_effector_position: min_max
state.head_position: min_max
state.waist_position: min_max
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm_joint_position
- action.right_arm_joint_position
- action.left_effector_position
- action.right_effector_position
- action.head_position
- action.waist_position
- action.robot_velocity
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm_joint_position
- action.right_arm_joint_position
- action.left_effector_position
- action.right_effector_position
- action.head_position
- action.waist_position
- action.robot_velocity
normalization_modes:
action.left_arm_joint_position: min_max
action.right_arm_joint_position: min_max
action.left_effector_position: min_max
action.right_effector_position: min_max
action.head_position: min_max
action.waist_position: min_max
action.robot_velocity: min_max
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.top_head
- video.hand_left
- video.hand_right
state_concat_order:
- state.left_arm_joint_position
- state.right_arm_joint_position
- state.left_effector_position
- state.right_effector_position
- state.head_position
- state.waist_position
action_concat_order:
- action.left_arm_joint_position
- action.right_arm_joint_position
- action.left_effector_position
- action.right_effector_position
- action.head_position
- action.waist_position
- action.robot_velocity
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
modality_configs:
robocasa_gr1_arms_only_fourier_hands:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.ego_view_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
robocasa_gr1_arms_waist_fourier_hands:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.ego_view_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
robocasa_gr1_fixed_lower_body_fourier_hands:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.agentview_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
robocasa_bimanual_panda_parallel_gripper:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
robocasa_bimanual_panda_inspire_hand:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
robocasa_panda_omron:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.left_view
- video.right_view
- video.wrist_view
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.action.task_description
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
gr1_unified:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.ego_view_pad_res256_freq20
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.human.coarse_action
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
oxe_droid:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.eef_position
- state.eef_rotation
- state.gripper_position
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.language_instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
oxe_fractal:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.image_pad_res256_freq03
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.world_vector
- action.rotation_delta
- action.gripper_position
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.natural_language_instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
oxe_language_table:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.rgb_pad_res256_freq10
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.effector_translation
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.action
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
oxe_bridge:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.image_0
- video.image_1
- video.image_2
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.eef_position
- action.eef_rotation
- action.gripper_position
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.language.language_instruction
lapa_action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- lapa_action
agibot:
video:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 16
modality_keys:
- video.top_head
- video.hand_left
- video.hand_right
state:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- state.left_arm_joint_position
- state.right_arm_joint_position
- state.left_effector_position
- state.right_effector_position
- state.head_position
- state.waist_position
action:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
modality_keys:
- action.left_arm_joint_position
- action.right_arm_joint_position
- action.left_effector_position
- action.right_effector_position
- action.head_position
- action.waist_position
- action.robot_velocity
language:
_target_: gr00t.data.dataset.ModalityConfig
delta_indices:
- 0
modality_keys:
- annotation.agibot.task_description
transforms:
robocasa_gr1_arms_only_fourier_hands:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.ego_view_pad_res256_freq20
scale: 0.95
mode: random
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.ego_view_pad_res256_freq20
height: 224
width: 224
interpolation: linear
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.ego_view_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.ego_view_pad_res256_freq20
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
normalization_modes:
state.left_arm: min_max
state.right_arm: min_max
state.left_hand: min_max
state.right_hand: min_max
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
normalization_modes:
action.right_arm: min_max
action.left_arm: min_max
action.right_hand: min_max
action.left_hand: min_max
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.ego_view_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the robocasa_gr1_arms_waist_fourier_hands embodiment.
# Inputs: one ego-view video stream, plus left/right arm, left/right hand and
# waist groups for both state and action (all normalized with min_max).
robocasa_gr1_arms_waist_fourier_hands:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# Video steps are listed in order: VideoToTensor, VideoCrop, VideoResize,
# VideoColorJitter, VideoToNumpy. All apply to the same single stream.
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.ego_view_pad_res256_freq20
# Crop configured with scale 0.95 in random mode.
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.ego_view_pad_res256_freq20
scale: 0.95
mode: random
# Resize to 224x224 with linear interpolation.
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.ego_view_pad_res256_freq20
height: 224
width: 224
interpolation: linear
# Color jitter strengths; NOTE(review): presumably a training-time
# augmentation - confirm whether it should be active at inference.
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.ego_view_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.ego_view_pad_res256_freq20
# State keys: StateActionToTensor first, then StateActionTransform with a
# per-key normalization mode (min_max for every state key here).
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
normalization_modes:
state.left_arm: min_max
state.right_arm: min_max
state.left_hand: min_max
state.right_hand: min_max
state.waist: min_max
# Action keys: same two-step treatment as the state keys, all min_max.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
normalization_modes:
action.right_arm: min_max
action.left_arm: min_max
action.right_hand: min_max
action.left_hand: min_max
action.waist: min_max
# Explicit concatenation order for video, state and action keys.
# NOTE(review): the resulting vector layout presumably follows this order, so
# reordering entries would change what each slot means - confirm against
# ConcatTransform before editing.
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.ego_view_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
# Final step of the pipeline. action_horizon (16), max_action_dim (32) and the
# SigLIP2 checkpoint match the model config at the top of this file.
# 112 = 6 images x 16 visual tokens + 16 action steps; NOTE(review): presumably
# how max_sequence_length was derived - confirm.
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Embodiment name -> integer id (32 entries, ids 0-31).
# This block's embodiment maps to id 25.
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the robocasa_gr1_fixed_lower_body_fourier_hands
# embodiment. Inputs: one agent-view video stream, plus left/right arm,
# left/right hand, waist and neck groups for both state and action (all
# normalized with min_max).
robocasa_gr1_fixed_lower_body_fourier_hands:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# Video steps are listed in order: VideoToTensor, VideoCrop, VideoResize,
# VideoColorJitter, VideoToNumpy. All apply to the same single stream.
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.agentview_pad_res256_freq20
# Crop configured with scale 0.95 in random mode.
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.agentview_pad_res256_freq20
scale: 0.95
mode: random
# Resize to 224x224 with linear interpolation.
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.agentview_pad_res256_freq20
height: 224
width: 224
interpolation: linear
# Color jitter strengths; NOTE(review): presumably a training-time
# augmentation - confirm whether it should be active at inference.
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.agentview_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.agentview_pad_res256_freq20
# State keys: StateActionToTensor first, then StateActionTransform with a
# per-key normalization mode (min_max for every state key here).
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
normalization_modes:
state.left_arm: min_max
state.right_arm: min_max
state.left_hand: min_max
state.right_hand: min_max
state.waist: min_max
state.neck: min_max
# Action keys: same two-step treatment as the state keys, all min_max.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
normalization_modes:
action.right_arm: min_max
action.left_arm: min_max
action.right_hand: min_max
action.left_hand: min_max
action.waist: min_max
action.neck: min_max
# Explicit concatenation order for video, state and action keys.
# NOTE(review): the resulting vector layout presumably follows this order, so
# reordering entries would change what each slot means - confirm against
# ConcatTransform before editing.
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.agentview_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- state.neck
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- action.neck
# Final step of the pipeline. action_horizon (16), max_action_dim (32) and the
# SigLIP2 checkpoint match the model config at the top of this file.
# 112 = 6 images x 16 visual tokens + 16 action steps; NOTE(review): presumably
# how max_sequence_length was derived - confirm.
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Embodiment name -> integer id (32 entries, ids 0-31).
# This block's embodiment maps to id 12.
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the robocasa_bimanual_panda_parallel_gripper
# embodiment. Inputs: three video streams (two eye-in-hand, one agent view),
# end-effector position/quaternion and gripper state per arm, and
# end-effector position/rotation plus a gripper-close action per arm.
robocasa_bimanual_panda_parallel_gripper:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# Video steps are listed in order: VideoToTensor, VideoCrop, VideoResize,
# VideoColorJitter, VideoToNumpy. Each applies to all three streams.
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
# Crop configured with scale 0.95 in random mode.
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
scale: 0.95
mode: random
# Resize to 224x224 with linear interpolation.
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
height: 224
width: 224
interpolation: linear
# Color jitter strengths; NOTE(review): presumably a training-time
# augmentation - confirm whether it should be active at inference.
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
# State keys: positions and gripper qpos use min_max; the two quaternion keys
# have no normalization mode and are instead given a rotation_6d target.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
normalization_modes:
state.right_arm_eef_pos: min_max
state.right_gripper_qpos: min_max
state.left_arm_eef_pos: min_max
state.left_gripper_qpos: min_max
target_rotations:
state.right_arm_eef_quat: rotation_6d
state.left_arm_eef_quat: rotation_6d
# Action keys: only the two gripper_close keys have a normalization mode
# (binary); eef_pos and eef_rot are listed in apply_to without one.
# NOTE(review): presumably those keys pass through un-normalized - confirm
# against StateActionTransform.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
normalization_modes:
action.right_gripper_close: binary
action.left_gripper_close: binary
# Explicit concatenation order for video, state and action keys.
# NOTE(review): the resulting vector layout presumably follows this order, so
# reordering entries would change what each slot means - confirm against
# ConcatTransform before editing.
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state_concat_order:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_gripper_qpos
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_gripper_qpos
action_concat_order:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_gripper_close
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_gripper_close
# Final step of the pipeline. action_horizon (16), max_action_dim (32) and the
# SigLIP2 checkpoint match the model config at the top of this file.
# 112 = 6 images x 16 visual tokens + 16 action steps; NOTE(review): presumably
# how max_sequence_length was derived - confirm.
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Embodiment name -> integer id (32 entries, ids 0-31).
# This block's embodiment maps to id 15.
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the robocasa_bimanual_panda_inspire_hand embodiment.
# Inputs: three video streams (two eye-in-hand, one agent view),
# end-effector position/quaternion and hand state per arm, and end-effector
# position/rotation plus a hand action per arm.
robocasa_bimanual_panda_inspire_hand:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# Video steps are listed in order: VideoToTensor, VideoCrop, VideoResize,
# VideoColorJitter, VideoToNumpy. Each applies to all three streams.
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
# Crop configured with scale 0.95 in random mode.
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
scale: 0.95
mode: random
# Resize to 224x224 with linear interpolation.
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
height: 224
width: 224
interpolation: linear
# Color jitter strengths; NOTE(review): presumably a training-time
# augmentation - confirm whether it should be active at inference.
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
# State keys: positions and hand states use min_max; the two quaternion keys
# have no normalization mode and are instead given a rotation_6d target.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
normalization_modes:
state.right_arm_eef_pos: min_max
state.right_hand: min_max
state.left_arm_eef_pos: min_max
state.left_hand: min_max
target_rotations:
state.right_arm_eef_quat: rotation_6d
state.left_arm_eef_quat: rotation_6d
# Action keys: only the two hand keys have a normalization mode (min_max);
# eef_pos and eef_rot are listed in apply_to without one.
# NOTE(review): presumably those keys pass through un-normalized - confirm
# against StateActionTransform.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
normalization_modes:
action.right_hand: min_max
action.left_hand: min_max
# Explicit concatenation order for video, state and action keys.
# NOTE(review): the resulting vector layout presumably follows this order, so
# reordering entries would change what each slot means - confirm against
# ConcatTransform before editing.
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.robot0_eye_in_hand_pad_res256_freq20
- video.robot1_eye_in_hand_pad_res256_freq20
- video.agentview_pad_res256_freq20
state_concat_order:
- state.right_arm_eef_pos
- state.right_arm_eef_quat
- state.right_hand
- state.left_arm_eef_pos
- state.left_arm_eef_quat
- state.left_hand
action_concat_order:
- action.right_arm_eef_pos
- action.right_arm_eef_rot
- action.right_hand
- action.left_arm_eef_pos
- action.left_arm_eef_rot
- action.left_hand
# Final step of the pipeline. action_horizon (16), max_action_dim (32) and the
# SigLIP2 checkpoint match the model config at the top of this file.
# 112 = 6 images x 16 visual tokens + 16 action steps; NOTE(review): presumably
# how max_sequence_length was derived - confirm.
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Embodiment name -> integer id (32 entries, ids 0-31).
# This block's embodiment maps to id 16.
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the robocasa_panda_omron embodiment.
# Inputs: three video streams (left, right, wrist), relative end-effector
# pose, gripper and base pose as state, and end-effector, gripper, base-motion
# and control-mode actions.
robocasa_panda_omron:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# Video steps are listed in order: VideoToTensor, VideoCrop, VideoResize,
# VideoColorJitter, VideoToNumpy. Each applies to all three streams.
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
# Crop configured with scale 0.95 in random mode.
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
scale: 0.95
mode: random
# Resize to 224x224 with linear interpolation.
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
height: 224
width: 224
interpolation: linear
# Color jitter strengths; NOTE(review): presumably a training-time
# augmentation - confirm whether it should be active at inference.
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.left_view
- video.right_view
- video.wrist_view
# State keys: every key uses min_max. The two rotation keys additionally get a
# rotation_6d target, i.e. they appear in BOTH normalization_modes and
# target_rotations. NOTE(review): other embodiment blocks in this file list
# rotation keys only under target_rotations - confirm the combination is
# intended and supported by StateActionTransform.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
normalization_modes:
state.end_effector_position_relative: min_max
state.end_effector_rotation_relative: min_max
state.gripper_qpos: min_max
state.base_position: min_max
state.base_rotation: min_max
target_rotations:
state.end_effector_rotation_relative: rotation_6d
state.base_rotation: rotation_6d
# Action keys: continuous keys use min_max; gripper_close and control_mode
# use binary. No target_rotations are set for actions here.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
normalization_modes:
action.end_effector_position: min_max
action.end_effector_rotation: min_max
action.gripper_close: binary
action.base_motion: min_max
action.control_mode: binary
# Explicit concatenation order for video, state and action keys.
# NOTE(review): the resulting vector layout presumably follows this order, so
# reordering entries would change what each slot means - confirm against
# ConcatTransform before editing.
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.left_view
- video.right_view
- video.wrist_view
state_concat_order:
- state.end_effector_position_relative
- state.end_effector_rotation_relative
- state.gripper_qpos
- state.base_position
- state.base_rotation
action_concat_order:
- action.end_effector_position
- action.end_effector_rotation
- action.gripper_close
- action.base_motion
- action.control_mode
# Final step of the pipeline. action_horizon (16), max_action_dim (32) and the
# SigLIP2 checkpoint match the model config at the top of this file.
# 112 = 6 images x 16 visual tokens + 16 action steps; NOTE(review): presumably
# how max_sequence_length was derived - confirm.
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Embodiment name -> integer id (32 entries, ids 0-31).
# This block's embodiment maps to id 13.
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the gr1_unified embodiment.
# Inputs: one ego-view video stream, plus left/right arm, left/right hand and
# waist groups for both state and action. Unlike the min_max blocks in this
# file, every state and action key here uses the "scale" normalization mode.
gr1_unified:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# Video steps are listed in order: VideoToTensor, VideoCrop, VideoResize,
# VideoColorJitter, VideoToNumpy. All apply to the same single stream.
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.ego_view_pad_res256_freq20
# Crop configured with scale 0.95 in random mode.
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.ego_view_pad_res256_freq20
scale: 0.95
mode: random
# Resize to 224x224 with linear interpolation.
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.ego_view_pad_res256_freq20
height: 224
width: 224
interpolation: linear
# Color jitter strengths; NOTE(review): presumably a training-time
# augmentation - confirm whether it should be active at inference.
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.ego_view_pad_res256_freq20
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.ego_view_pad_res256_freq20
# State keys: StateActionToTensor first, then StateActionTransform with the
# "scale" mode on every key.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
normalization_modes:
state.left_arm: scale
state.right_arm: scale
state.left_hand: scale
state.right_hand: scale
state.waist: scale
# Action keys: same two-step treatment as the state keys, all "scale".
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
normalization_modes:
action.left_arm: scale
action.right_arm: scale
action.left_hand: scale
action.right_hand: scale
action.waist: scale
# Explicit concatenation order for video, state and action keys.
# NOTE(review): the resulting vector layout presumably follows this order, so
# reordering entries would change what each slot means - confirm against
# ConcatTransform before editing.
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.ego_view_pad_res256_freq20
state_concat_order:
- state.left_arm
- state.right_arm
- state.left_hand
- state.right_hand
- state.waist
action_concat_order:
- action.left_arm
- action.right_arm
- action.left_hand
- action.right_hand
- action.waist
# Final step of the pipeline. action_horizon (16), max_action_dim (32) and the
# SigLIP2 checkpoint match the model config at the top of this file.
# 112 = 6 images x 16 visual tokens + 16 action steps; NOTE(review): presumably
# how max_sequence_length was derived - confirm.
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Embodiment name -> integer id (32 entries, ids 0-31).
# This block's embodiment maps to id 24.
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the oxe_droid embodiment.
# Inputs: three video streams (two exterior, one wrist), end-effector
# position/rotation and gripper position as state, and end-effector
# position/rotation deltas plus gripper position as action.
oxe_droid:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# Video steps are listed in order: VideoToTensor, VideoCrop, VideoResize,
# VideoColorJitter, VideoToNumpy. Each applies to all three streams.
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
# Crop configured with scale 0.95 in random mode.
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
scale: 0.95
mode: random
# Resize to 224x224 with linear interpolation.
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
height: 224
width: 224
interpolation: linear
# Color jitter strengths; NOTE(review): presumably a training-time
# augmentation - confirm whether it should be active at inference.
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
# State keys: position and gripper use min_max; eef_rotation has no
# normalization mode and is given a rotation_6d target instead.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_position
normalization_modes:
state.eef_position: min_max
state.gripper_position: min_max
target_rotations:
state.eef_rotation: rotation_6d
# Action keys: gripper_position is binary, eef_rotation_delta is given an
# axis_angle target, and eef_position_delta has no normalization mode.
# NOTE(review): presumably eef_position_delta passes through un-normalized -
# confirm against StateActionTransform.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
normalization_modes:
action.gripper_position: binary
target_rotations:
action.eef_rotation_delta: axis_angle
# Explicit concatenation order for video, state and action keys.
# NOTE(review): the resulting vector layout presumably follows this order, so
# reordering entries would change what each slot means - confirm against
# ConcatTransform before editing.
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.exterior_image_1_left_pad_res256_freq15
- video.exterior_image_2_left_pad_res256_freq15
- video.wrist_image_left_pad_res256_freq15
state_concat_order:
- state.eef_position
- state.eef_rotation
- state.gripper_position
action_concat_order:
- action.eef_position_delta
- action.eef_rotation_delta
- action.gripper_position
# Final step of the pipeline. action_horizon (16), max_action_dim (32) and the
# SigLIP2 checkpoint match the model config at the top of this file.
# 112 = 6 images x 16 visual tokens + 16 action steps; NOTE(review): presumably
# how max_sequence_length was derived - confirm.
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Embodiment name -> integer id (32 entries, ids 0-31).
# This block's embodiment maps to id 17.
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the oxe_fractal embodiment.
# Inputs: one video stream, end-effector position/rotation and commanded
# gripper closedness as state, and world vector, rotation delta and gripper
# position as action.
oxe_fractal:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# Video steps are listed in order: VideoToTensor, VideoCrop, VideoResize,
# VideoColorJitter, VideoToNumpy. All apply to the same single stream.
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.image_pad_res256_freq03
# Crop configured with scale 0.95 in random mode.
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.image_pad_res256_freq03
scale: 0.95
mode: random
# Resize to 224x224 with linear interpolation.
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.image_pad_res256_freq03
height: 224
width: 224
interpolation: linear
# Color jitter strengths; NOTE(review): presumably a training-time
# augmentation - confirm whether it should be active at inference.
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.image_pad_res256_freq03
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.image_pad_res256_freq03
# State keys: position and gripper closedness use min_max; eef_rotation has
# no normalization mode and is given a rotation_6d target instead.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
normalization_modes:
state.eef_position: min_max
state.gripper_closedness_commanded: min_max
target_rotations:
state.eef_rotation: rotation_6d
# Action keys: gripper_position is binary, rotation_delta is given an
# axis_angle target, and world_vector has no normalization mode.
# NOTE(review): presumably world_vector passes through un-normalized -
# confirm against StateActionTransform.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.world_vector
- action.rotation_delta
- action.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.world_vector
- action.rotation_delta
- action.gripper_position
normalization_modes:
action.gripper_position: binary
target_rotations:
action.rotation_delta: axis_angle
# Explicit concatenation order for video, state and action keys.
# NOTE(review): the resulting vector layout presumably follows this order, so
# reordering entries would change what each slot means - confirm against
# ConcatTransform before editing.
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.image_pad_res256_freq03
state_concat_order:
- state.eef_position
- state.eef_rotation
- state.gripper_closedness_commanded
action_concat_order:
- action.world_vector
- action.rotation_delta
- action.gripper_position
# Final step of the pipeline. action_horizon (16), max_action_dim (32) and the
# SigLIP2 checkpoint match the model config at the top of this file.
# 112 = 6 images x 16 visual tokens + 16 action steps; NOTE(review): presumably
# how max_sequence_length was derived - confirm.
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Embodiment name -> integer id (32 entries, ids 0-31).
# This block's embodiment maps to id 18.
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the oxe_language_table embodiment.
# Inputs: one RGB video stream, a single state key (effector_translation) and
# a single action key (action.action), both normalized with min_max.
oxe_language_table:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# Video steps are listed in order: VideoToTensor, VideoCrop, VideoResize,
# VideoColorJitter, VideoToNumpy. All apply to the same single stream.
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.rgb_pad_res256_freq10
# Crop configured with scale 0.95 in random mode.
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.rgb_pad_res256_freq10
scale: 0.95
mode: random
# Resize to 224x224 with linear interpolation.
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.rgb_pad_res256_freq10
height: 224
width: 224
interpolation: linear
# Color jitter strengths; NOTE(review): presumably a training-time
# augmentation - confirm whether it should be active at inference.
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.rgb_pad_res256_freq10
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.rgb_pad_res256_freq10
# State: a single key, min_max normalized; no rotation targets.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.effector_translation
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.effector_translation
normalization_modes:
state.effector_translation: min_max
# Action: a single key, min_max normalized; no rotation targets.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.action
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.action
normalization_modes:
action.action: min_max
# Explicit concatenation order; each modality has exactly one key here.
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.rgb_pad_res256_freq10
state_concat_order:
- state.effector_translation
action_concat_order:
- action.action
# Final step of the pipeline. action_horizon (16), max_action_dim (32) and the
# SigLIP2 checkpoint match the model config at the top of this file.
# 112 = 6 images x 16 visual tokens + 16 action steps; NOTE(review): presumably
# how max_sequence_length was derived - confirm.
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Embodiment name -> integer id (32 entries, ids 0-31).
# This block's embodiment maps to id 19.
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the oxe_bridge embodiment.
# Inputs: three video streams (image_0..image_2), end-effector
# position/rotation and gripper_closed as state, and end-effector
# position/rotation plus gripper position as action.
oxe_bridge:
_target_: gr00t.data.transform.ComposedModalityTransform
transforms:
# Video steps are listed in order: VideoToTensor, VideoCrop, VideoResize,
# VideoColorJitter, VideoToNumpy. Each applies to all three streams.
- _target_: gr00t.data.transform.VideoToTensor
apply_to:
- video.image_0
- video.image_1
- video.image_2
# Crop configured with scale 0.95 in random mode.
- _target_: gr00t.data.transform.VideoCrop
apply_to:
- video.image_0
- video.image_1
- video.image_2
scale: 0.95
mode: random
# Resize to 224x224 with linear interpolation.
- _target_: gr00t.data.transform.VideoResize
apply_to:
- video.image_0
- video.image_1
- video.image_2
height: 224
width: 224
interpolation: linear
# Color jitter strengths; NOTE(review): presumably a training-time
# augmentation - confirm whether it should be active at inference.
- _target_: gr00t.data.transform.VideoColorJitter
apply_to:
- video.image_0
- video.image_1
- video.image_2
brightness: 0.3
contrast: 0.4
saturation: 0.5
hue: 0.08
- _target_: gr00t.data.transform.VideoToNumpy
apply_to:
- video.image_0
- video.image_1
- video.image_2
# State keys: position and gripper_closed use min_max; eef_rotation has no
# normalization mode and is given a rotation_6d target instead.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
normalization_modes:
state.eef_position: min_max
state.gripper_closed: min_max
target_rotations:
state.eef_rotation: rotation_6d
# Action keys: gripper_position is binary, eef_rotation is given an
# axis_angle target, and eef_position has no normalization mode.
# NOTE(review): presumably eef_position passes through un-normalized -
# confirm against StateActionTransform.
- _target_: gr00t.data.transform.StateActionToTensor
apply_to:
- action.eef_position
- action.eef_rotation
- action.gripper_position
- _target_: gr00t.data.transform.StateActionTransform
apply_to:
- action.eef_position
- action.eef_rotation
- action.gripper_position
normalization_modes:
action.gripper_position: binary
target_rotations:
action.eef_rotation: axis_angle
# Explicit concatenation order for video, state and action keys.
# NOTE(review): the resulting vector layout presumably follows this order, so
# reordering entries would change what each slot means - confirm against
# ConcatTransform before editing.
- _target_: gr00t.data.transform.ConcatTransform
video_concat_order:
- video.image_0
- video.image_1
- video.image_2
state_concat_order:
- state.eef_position
- state.eef_rotation
- state.gripper_closed
action_concat_order:
- action.eef_position
- action.eef_rotation
- action.gripper_position
# Final step of the pipeline. action_horizon (16), max_action_dim (32) and the
# SigLIP2 checkpoint match the model config at the top of this file.
# 112 = 6 images x 16 visual tokens + 16 action steps; NOTE(review): presumably
# how max_sequence_length was derived - confirm.
- _target_: gr00t.model.transforms_idm.GR00TIDMTransform
default_instruction: Perform the default behavior.
num_visual_tokens_per_frame: 16
max_num_images_per_sequence: 6
max_action_dim: 32
max_sequence_length: 112
action_horizon: 16
siglip_processor:
_target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
_convert_: object
pretrained_model_name_or_path: google/siglip2-large-patch16-256
# Embodiment name -> integer id (32 entries, ids 0-31).
# This block's embodiment maps to id 20.
embodiment_tag_mapping:
real_gr1_arms_only: 0
real_gr1_arms_only_annotated: 1
real_gr1_arms_waist: 2
real_gr1_arms_waist_annotated: 3
dexmg_gr1_arms_only_inspire: 4
dexmg_gr1_arms_only_fourier: 5
dexmg_gr1_arms_waist_fourier: 6
robocasa_single_arm: 7
onex_eve_gripper: 8
robocasa_gr1_arms_only_inspire_hands: 9
robocasa_gr1_arms_only_fourier_hands: 10
robocasa_gr1_fixed_lower_body_inspire_hands: 11
robocasa_gr1_fixed_lower_body_fourier_hands: 12
robocasa_panda_omron: 13
robocasa_single_arm_panda_omron: 14
robocasa_bimanual_panda_parallel_gripper: 15
robocasa_bimanual_panda_inspire_hand: 16
oxe_droid: 17
oxe_fractal: 18
oxe_language_table: 19
oxe_bridge: 20
real_panda_single_arm: 21
unknown: 22
hot3d_hands_only: 23
gr1_unified: 24
robocasa_gr1_arms_waist_fourier_hands: 25
agibot: 26
lapa: 27
oxe_mutex: 28
oxe_roboset: 29
oxe_plex: 30
dream: 31
# Transform pipeline for the `agibot` embodiment (id 26 in the tag table below).
# Nesting is written out explicitly: each `- _target_:` entry is one element of
# `transforms`, and its parameters are indented beneath it. Without that
# indentation the repeated `_target_` / `apply_to` keys collide as duplicates.
# NOTE(review): the parent key of this entry is outside this excerpt - confirm
# the depth at which `agibot` must sit.
agibot:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # --- Video: head camera plus the two hand cameras go through every step. ---
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to:
    - video.top_head
    - video.hand_left
    - video.hand_right
  - _target_: gr00t.data.transform.VideoCrop
    apply_to:
    - video.top_head
    - video.hand_left
    - video.hand_right
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to:
    - video.top_head
    - video.hand_left
    - video.hand_right
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to:
    - video.top_head
    - video.hand_left
    - video.hand_right
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to:
    - video.top_head
    - video.hand_left
    - video.hand_right
  # --- State: six keys, all normalized with min_max. ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - state.left_arm_joint_position
    - state.right_arm_joint_position
    - state.left_effector_position
    - state.right_effector_position
    - state.head_position
    - state.waist_position
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - state.left_arm_joint_position
    - state.right_arm_joint_position
    - state.left_effector_position
    - state.right_effector_position
    - state.head_position
    - state.waist_position
    normalization_modes:
      state.left_arm_joint_position: min_max
      state.right_arm_joint_position: min_max
      state.left_effector_position: min_max
      state.right_effector_position: min_max
      state.head_position: min_max
      state.waist_position: min_max
  # --- Action: the six state counterparts plus robot_velocity, all min_max. ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - action.left_arm_joint_position
    - action.right_arm_joint_position
    - action.left_effector_position
    - action.right_effector_position
    - action.head_position
    - action.waist_position
    - action.robot_velocity
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - action.left_arm_joint_position
    - action.right_arm_joint_position
    - action.left_effector_position
    - action.right_effector_position
    - action.head_position
    - action.waist_position
    - action.robot_velocity
    normalization_modes:
      action.left_arm_joint_position: min_max
      action.right_arm_joint_position: min_max
      action.left_effector_position: min_max
      action.right_effector_position: min_max
      action.head_position: min_max
      action.waist_position: min_max
      action.robot_velocity: min_max
  # --- Concatenation order of the per-key arrays. ---
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order:
    - video.top_head
    - video.hand_left
    - video.hand_right
    state_concat_order:
    - state.left_arm_joint_position
    - state.right_arm_joint_position
    - state.left_effector_position
    - state.right_effector_position
    - state.head_position
    - state.waist_position
    action_concat_order:
    - action.left_arm_joint_position
    - action.right_arm_joint_position
    - action.left_effector_position
    - action.right_effector_position
    - action.head_position
    - action.waist_position
    - action.robot_velocity
  # --- Final model-input transform. ---
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: Perform the default behavior.
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    # NOTE(review): presumably 6 images x 16 visual tokens + 16 action steps
    # = 112 - confirm against the transform implementation.
    max_sequence_length: 112
    action_horizon: 16
    # Same checkpoint name as the model's siglip_model_cfg at the top of the file.
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment name -> integer id. The same table is repeated in the other
    # embodiment entries visible in this file.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_single_arm_panda_omron: 14
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      agibot: 26
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
# Metadata version tag per embodiment name. Entries are indented so they belong
# to `metadata_versions` instead of leaking into the parent mapping.
# The values stay quoted: a plain 0217 would be read as a number, not a string.
# NOTE(review): only a subset of the names in the embodiment tag table has an
# entry here - confirm how missing names are handled by the consumer.
metadata_versions:
  robocasa_gr1_arms_only_fourier_hands: '0217'
  robocasa_gr1_fixed_lower_body_fourier_hands: '0217'
  robocasa_bimanual_panda_parallel_gripper: '0217'
  robocasa_bimanual_panda_inspire_hand: '0217'
  robocasa_panda_omron: '0217'
  gr1_unified: '0225'
  oxe_droid: '0221'
  oxe_fractal: '0221'
  oxe_language_table: '0221'
  oxe_bridge: '0221'
  robocasa_gr1_arms_waist_fourier_hands: '0225'
  agibot: '0225'
# Trailing scalar settings.
# NOTE(review): the parent mapping of these keys is not visible in this excerpt;
# they are kept at the column they were found at.

# `???` is the OmegaConf marker for a mandatory value that must be supplied
# before the config is used.
dataset_path: ???
# Same value as max_state_dim in the action-head config at the top of the file.
max_state_dim: 44
# Same dotted path as train_dataset._target_ at the top of the file.
mixture_dataset_cls: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec
single_dataset_cls: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotSingleDataset
# NOTE(review): absolute, machine-specific path - presumably needs overriding
# outside the original environment.
data_root: /mnt/amlfs-02/shared/datasets
# Quoted so the hash is always loaded as a string, whatever hex digits it holds.
gr00t_commit_hash: '16d97a65f0541e14efa958455542c5ae3ad9607f'
# NOTE(review): unusually large for a step count - confirm this is intended and
# not a placeholder or a derived product.
total_training_steps: 163840000000