# Inverse-dynamics model (IDM) definition. The `_target_`, `_convert_` and
# `_recursive_` keys follow Hydra's instantiate conventions.
model:
  _target_: gr00t.model.idm.IDM
  _convert_: object
  config:
    _target_: gr00t.model.idm.IDMConfig
    # Nested `*_cfg` nodes are not instantiated recursively at this level.
    # NOTE(review): presumably IDM builds them itself — confirm.
    _recursive_: false
    model_dtype: float32
    # NOTE(review): hidden_size is 0 here while the action head uses 1024;
    # presumably unused with IdentityBackbone — confirm this is intended.
    hidden_size: 0
    # The same two values (16 / 32) are repeated in the action-head config
    # below; keep them in sync when editing.
    action_horizon: 16
    action_dim: 32
    backbone_cfg:
      _target_: gr00t.model.backbone.IdentityBackbone
    action_head_cfg:
      _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM
      _convert_: object
      config:
        _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig
        _recursive_: false
        # NOTE(review): the key is spelled `seperator`; presumably it has to
        # match the field name on the config class, so do not "fix" the
        # spelling here without changing the consumer as well.
        add_seperator_token: true
        add_pos_embed: true
        model_dtype: float32
        mm_vision_select_layer: -2
        max_state_dim: 44
        # Same value as `action_dim` (32) in this file.
        max_action_dim: 32
        hidden_size: 1024
        tune_vision_tower: true
        add_view_embed: true
        max_num_views: 3
        # Vision tower: loaded through `from_pretrained` from the checkpoint
        # named below.
        siglip_model_cfg:
          _target_: gr00t.model.action_head.siglip.SiglipModel.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        siglip_hidden_size: 1024
        # 16 heads x 64 dims per head = 1024, equal to `hidden_size` above.
        vl_self_attention_cfg:
          _target_: gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer
          positional_embeddings: null
          num_layers: 4
          num_attention_heads: 16
          attention_head_dim: 64
          dropout: 0.2
          final_dropout: true
        # DiT with the same head geometry as the self-attention stack
        # (16 x 64) and twice the depth (8 layers).
        diffusion_model_cfg:
          _target_: gr00t.model.action_head.cross_attention_dit.DiT
          positional_embeddings: null
          num_layers: 8
          num_attention_heads: 16
          attention_head_dim: 64
          norm_type: ada_norm
          dropout: 0.2
          final_dropout: true
          output_dim: 1024
          interleave_self_attention: true
        # Projector between the vision features (`mm_hidden_size`) and the
        # head width (`hidden_size`); both are 1024 here.
        mm_projector_cfg:
          _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjector
          _convert_: object
          config:
            _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig
            hidden_size: 1024
            mm_hidden_size: 1024
            mm_projector_type: mlp_doubledownsample
        action_dim: 32
        action_horizon: 16
        num_inference_timesteps: 16
        # NOTE(review): presumably Beta(alpha, beta) parameters and a scale
        # for sampling the flow-matching noise level — confirm in the head.
        noise_beta_alpha: 1.5
        noise_beta_beta: 1.0
        noise_s: 0.999
        num_timestep_buckets: 1000
        # No backbone-feature projector is configured.
        backbone_features_projector_cfg: null
# Training data, built through
# `ShardedLeRobotMixtureDataset.from_mixture_spec`.
train_dataset:
  _target_: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec
  _convert_: object
  # One mixture entry: a list of dataset paths plus a `dataset_weight`
  # (presumably the sampling weight of that entry — confirm).
  # NOTE(review): the paths are absolute, user-specific mount points and
  # will not resolve on other machines; override them per environment.
  mixture_spec:
  - dataset_path:
    - /mnt/amlfs-01/home/seonghyeony/data/2RA_idm/gr1_unified.UnzeroedArmsOnlyRemoveStaticSliceBGCropPad256Freq20_train
    - /mnt/amlfs-01/home/seonghyeony/data/2RA_idm/gr1_unified.UnzeroedArmsWaistRemoveStaticSliceBGCropPad256Freq20_train
    dataset_weight: 1.0
  dataset_class: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotSingleDataset
  # Modality layouts, one mapping per embodiment name.
  all_modality_configs:
    # GR1 arms-only (Fourier hands): one ego-view camera; state and action
    # both cover left/right arm and left/right hand.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    robocasa_gr1_arms_only_fourier_hands:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.ego_view_pad_res256_freq20
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.human.action.task_description
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # GR1 arms + waist (Fourier hands): one ego-view camera; state and
    # action cover both arms, both hands and the waist.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    robocasa_gr1_arms_waist_fourier_hands:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.ego_view_pad_res256_freq20
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.human.action.task_description
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # GR1 fixed lower body (Fourier hands): one agent-view camera; state and
    # action cover both arms, both hands, waist and neck.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    robocasa_gr1_fixed_lower_body_fourier_hands:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.agentview_pad_res256_freq20
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.human.action.task_description
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # Bimanual Panda with parallel grippers: three cameras (two eye-in-hand,
    # one agent view). State uses eef pos/quat + gripper qpos per arm;
    # action uses eef pos/rot + gripper close per arm.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    robocasa_bimanual_panda_parallel_gripper:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.human.action.task_description
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # Bimanual Panda with Inspire hands: three cameras (two eye-in-hand, one
    # agent view). State uses eef pos/quat + hand per arm; action uses
    # eef pos/rot + hand per arm.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    robocasa_bimanual_panda_inspire_hand:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.human.action.task_description
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # Panda on an Omron base: two side cameras and one wrist camera. State
    # holds relative end-effector pose, gripper qpos and base pose; action
    # holds end-effector pose, gripper close, base motion and control mode.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    robocasa_panda_omron:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.end_effector_position_relative
        - state.end_effector_rotation_relative
        - state.gripper_qpos
        - state.base_position
        - state.base_rotation
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.end_effector_position
        - action.end_effector_rotation
        - action.gripper_close
        - action.base_motion
        - action.control_mode
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.human.action.task_description
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # GR1 unified: one `video.ego_view` camera; state and action cover both
    # arms, both hands and the waist; language comes from
    # `annotation.human.coarse_action`.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    gr1_unified:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.ego_view
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.human.coarse_action
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # OXE DROID: two exterior cameras and one wrist camera. State is eef
    # position/rotation + gripper position; action is eef position/rotation
    # deltas + gripper position.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    oxe_droid:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_position
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.eef_position_delta
        - action.eef_rotation_delta
        - action.gripper_position
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.language.language_instruction
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # OXE Fractal: a single camera. State is eef position/rotation plus the
    # commanded gripper closedness; action is world vector, rotation delta
    # and gripper position.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    oxe_fractal:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.image_pad_res256_freq03
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closedness_commanded
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.world_vector
        - action.rotation_delta
        - action.gripper_position
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.language.natural_language_instruction
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # OXE Language-Table: a single RGB camera, one state key
    # (`state.effector_translation`) and one action key (`action.action`).
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    oxe_language_table:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.rgb_pad_res256_freq10
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.effector_translation
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.action
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.language.instruction
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # OXE Bridge: three cameras (`image_0` .. `image_2`). State is eef
    # position/rotation + `gripper_closed`; action is eef position/rotation
    # + `gripper_position`.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    oxe_bridge:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.image_0
        - video.image_1
        - video.image_2
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closed
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.language.language_instruction
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # OXE MUTEX: a main camera and a wrist camera. State is joint angles +
    # `gripper_closed`, whereas action is expressed as eef position/rotation
    # + `gripper_position`.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    oxe_mutex:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.image
        - video.wrist_image
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.joint_angles
        - state.gripper_closed
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.language.language_instruction
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # OXE PLEX: a main camera and a wrist camera; a single opaque state key
    # (`state.state`); action is eef position/rotation + gripper position.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    oxe_plex:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.image
        - video.wrist_image
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.state
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.language.language_instruction
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # OXE RoboSet: left, right and wrist cameras. State is joint position +
    # `gripper_closed`; action is joint position + `gripper_position`.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    oxe_roboset:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.image_left
        - video.image_right
        - video.image_wrist
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.joint_position
        - state.gripper_closed
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.joint_position
        - action.gripper_position
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.language.language_instruction
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # HOT3D hands-only: one ego-view camera; state and action both list
    # wrist position, wrist rotation and joint rotation for each hand.
    # Unlike the neighbouring embodiments, this entry defines no `language`
    # modality.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    hot3d_hands_only:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.ego_view
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.left_wrist_position
        - state.left_wrist_rotation
        - state.left_joint_rotation
        - state.right_wrist_position
        - state.right_wrist_rotation
        - state.right_joint_rotation
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.left_wrist_position
        - action.left_wrist_rotation
        - action.left_joint_rotation
        - action.right_wrist_position
        - action.right_wrist_rotation
        - action.right_joint_rotation
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # AgiBot: head camera plus left/right hand cameras. State has six keys
    # (arm joints, effectors, head, waist); action has the same six plus
    # `action.robot_velocity`.
    # NOTE(review): the `embodiment_tag_mapping` in this file's first
    # `all_transforms` entry has no `agibot` key — confirm whether this
    # embodiment is meant to be usable with this config.
    agibot:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.top_head
        - video.hand_left
        - video.hand_right
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.left_arm_joint_position
        - state.right_arm_joint_position
        - state.left_effector_position
        - state.right_effector_position
        - state.head_position
        - state.waist_position
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.left_arm_joint_position
        - action.right_arm_joint_position
        - action.left_effector_position
        - action.right_effector_position
        - action.head_position
        - action.waist_position
        - action.robot_velocity
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.agibot.task_description
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # LAPA: video-only embodiment — one `video.ego` camera plus language,
    # `lapa_action` and `dream_actions`; no `state` or `action` modality is
    # defined.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    lapa:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.ego
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.human.action.task_description
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # Dream: one background-cropped ego-view camera; state and action cover
    # both arms, both hands and the waist; language comes from
    # `annotation.human.coarse_action`.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    dream:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.ego_view_bg_crop_pad_res256_freq20
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.human.coarse_action
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
    # GR1 unified, segmentation variant: same camera and state keys as the
    # `dream` entry, but the action modality carries
    # `segmentation_target` and `segmentation_target_mask` instead of joint
    # groups.
    # NOTE(review): `delta_indices` are presumably step offsets relative to
    # the sampled index — confirm against ModalityConfig.
    gr1_unified_segmentation:
      video:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0, 16]
        modality_keys:
        - video.ego_view_bg_crop_pad_res256_freq20
      state:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      action:
        _target_: gr00t.data.dataset.ModalityConfig
        # 16 consecutive offsets, 0 through 15.
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys:
        - action.segmentation_target
        - action.segmentation_target_mask
      language:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - annotation.human.coarse_action
      lapa_action:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - lapa_action
      dream_actions:
        _target_: gr00t.data.dataset.ModalityConfig
        delta_indices: [0]
        modality_keys:
        - dream_actions
  all_transforms:
    # Transform pipeline for the GR1 arms-only (Fourier hands) embodiment.
    # Steps are listed in order: video preprocessing, state
    # normalisation, action normalisation, concatenation, then the IDM
    # model-input transform.
    robocasa_gr1_arms_only_fourier_hands:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      # Video: to tensor, random crop (scale 0.95), resize to 224x224,
      # colour jitter, back to numpy.
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to: [video.ego_view_pad_res256_freq20]
      - _target_: gr00t.data.transform.VideoCrop
        apply_to: [video.ego_view_pad_res256_freq20]
        scale: 0.95
        mode: random
      - _target_: gr00t.data.transform.VideoResize
        apply_to: [video.ego_view_pad_res256_freq20]
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to: [video.ego_view_pad_res256_freq20]
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to: [video.ego_view_pad_res256_freq20]
      # State: to tensor, then min-max normalisation for every key.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to: [state.left_arm, state.right_arm, state.left_hand, state.right_hand]
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to: [state.left_arm, state.right_arm, state.left_hand, state.right_hand]
        normalization_modes:
          state.left_arm: min_max
          state.right_arm: min_max
          state.left_hand: min_max
          state.right_hand: min_max
      # Action: to tensor, then min-max normalisation for every key.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to: [action.left_arm, action.right_arm, action.left_hand, action.right_hand]
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to: [action.left_arm, action.right_arm, action.left_hand, action.right_hand]
        normalization_modes:
          action.right_arm: min_max
          action.left_arm: min_max
          action.right_hand: min_max
          action.left_hand: min_max
      # Concatenation order for each modality.
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order: [video.ego_view_pad_res256_freq20]
        state_concat_order: [state.left_arm, state.right_arm, state.left_hand, state.right_hand]
        action_concat_order: [action.left_arm, action.right_arm, action.left_hand, action.right_hand]
      # Final model-input transform.
      # NOTE(review): 112 = 6 images x 16 visual tokens + 16 action steps —
      # presumably how max_sequence_length was derived; confirm.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment name -> integer id.
        # NOTE(review): id 26 is unused and there is no `agibot` key even
        # though `all_modality_configs` defines an `agibot` embodiment;
        # `gr1_unified_segmentation` takes id 14 out of sequence. Confirm
        # that both are intentional.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `robocasa_gr1_arms_waist_fourier_hands`
    # embodiment: one video stream (ego view) plus left/right arm, left/right
    # hand and waist keys for both state and action. Every state/action key
    # listed here is given `min_max` normalization.
    robocasa_gr1_arms_waist_fourier_hands:
      _target_: gr00t.data.transform.ComposedModalityTransform
      # Ordered list of transforms; presumably applied top to bottom --
      # confirm against ComposedModalityTransform.
      transforms:
      # --- Video stage: tensor -> crop -> resize -> color jitter -> numpy ---
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.ego_view_pad_res256_freq20
      # Crop configured with scale 0.95 in `random` mode.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.ego_view_pad_res256_freq20
        scale: 0.95
        mode: random
      # Resize target is 224 x 224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.ego_view_pad_res256_freq20
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.ego_view_pad_res256_freq20
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.ego_view_pad_res256_freq20
      # --- State stage: to tensor, then min_max normalization on every key ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        normalization_modes:
          state.left_arm: min_max
          state.right_arm: min_max
          state.left_hand: min_max
          state.right_hand: min_max
          state.waist: min_max
      # --- Action stage: mirrors the state stage key-for-key ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        # Keys are listed right-before-left here; the set of keys is the same
        # as in `apply_to`.
        normalization_modes:
          action.right_arm: min_max
          action.left_arm: min_max
          action.right_hand: min_max
          action.left_hand: min_max
          action.waist: min_max
      # --- Concatenation: the *_concat_order lists name the keys to be joined
      # per modality, in the order given ---
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.ego_view_pad_res256_freq20
        state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      # --- Final model-input transform ---
      # `max_action_dim` (32) and `action_horizon` (16) equal the `action_dim`
      # and `action_horizon` values in the `model.config` section at the top of
      # this file, and the SigLIP processor is loaded from the same checkpoint
      # name as `siglip_model_cfg` there.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment name -> integer ID table (same table is repeated in every
        # embodiment stanza visible in this file). This stanza's own entry is
        # `robocasa_gr1_arms_waist_fourier_hands: 25`. ID 26 is not assigned,
        # and `gr1_unified_segmentation: 14` is listed last, out of numeric
        # order.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `robocasa_gr1_fixed_lower_body_fourier_hands`
    # embodiment: one video stream (agent view) plus left/right arm, left/right
    # hand, waist and neck keys for both state and action. Every state/action
    # key listed here is given `min_max` normalization.
    robocasa_gr1_fixed_lower_body_fourier_hands:
      _target_: gr00t.data.transform.ComposedModalityTransform
      # Ordered list of transforms; presumably applied top to bottom --
      # confirm against ComposedModalityTransform.
      transforms:
      # --- Video stage: tensor -> crop -> resize -> color jitter -> numpy ---
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.agentview_pad_res256_freq20
      # Crop configured with scale 0.95 in `random` mode.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.agentview_pad_res256_freq20
        scale: 0.95
        mode: random
      # Resize target is 224 x 224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.agentview_pad_res256_freq20
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.agentview_pad_res256_freq20
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.agentview_pad_res256_freq20
      # --- State stage: to tensor, then min_max normalization on every key ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
        normalization_modes:
          state.left_arm: min_max
          state.right_arm: min_max
          state.left_hand: min_max
          state.right_hand: min_max
          state.waist: min_max
          state.neck: min_max
      # --- Action stage: mirrors the state stage key-for-key ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
        # Keys are listed right-before-left here; the set of keys is the same
        # as in `apply_to`.
        normalization_modes:
          action.right_arm: min_max
          action.left_arm: min_max
          action.right_hand: min_max
          action.left_hand: min_max
          action.waist: min_max
          action.neck: min_max
      # --- Concatenation: the *_concat_order lists name the keys to be joined
      # per modality, in the order given ---
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.agentview_pad_res256_freq20
        state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
        action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
      # --- Final model-input transform ---
      # `max_action_dim` (32) and `action_horizon` (16) equal the `action_dim`
      # and `action_horizon` values in the `model.config` section at the top of
      # this file, and the SigLIP processor is loaded from the same checkpoint
      # name as `siglip_model_cfg` there.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment name -> integer ID table (same table is repeated in every
        # embodiment stanza visible in this file). This stanza's own entry is
        # `robocasa_gr1_fixed_lower_body_fourier_hands: 12`. ID 26 is not
        # assigned, and `gr1_unified_segmentation: 14` is listed last, out of
        # numeric order.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `robocasa_bimanual_panda_parallel_gripper`
    # embodiment: three video streams (two eye-in-hand views and one agent
    # view) plus per-arm end-effector position, end-effector rotation and
    # gripper keys for both state and action.
    robocasa_bimanual_panda_parallel_gripper:
      _target_: gr00t.data.transform.ComposedModalityTransform
      # Ordered list of transforms; presumably applied top to bottom --
      # confirm against ComposedModalityTransform.
      transforms:
      # --- Video stage: tensor -> crop -> resize -> color jitter -> numpy,
      # each applied to the same three video keys ---
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      # Crop configured with scale 0.95 in `random` mode.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
        scale: 0.95
        mode: random
      # Resize target is 224 x 224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      # --- State stage ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
        # Position and gripper keys get min_max; the two quaternion keys are
        # not listed here and instead appear under `target_rotations`.
        normalization_modes:
          state.right_arm_eef_pos: min_max
          state.right_gripper_qpos: min_max
          state.left_arm_eef_pos: min_max
          state.left_gripper_qpos: min_max
        # Target rotation representation for the two `*_eef_quat` keys.
        target_rotations:
          state.right_arm_eef_quat: rotation_6d
          state.left_arm_eef_quat: rotation_6d
      # --- Action stage ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
        # Only the two gripper keys have a normalization mode (`binary`). The
        # eef position/rotation action keys are in `apply_to` but have no
        # `normalization_modes` or `target_rotations` entry in this stanza.
        normalization_modes:
          action.right_gripper_close: binary
          action.left_gripper_close: binary
      # --- Concatenation: the *_concat_order lists name the keys to be joined
      # per modality, in the order given (right arm before left arm) ---
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
        state_concat_order:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
        action_concat_order:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
      # --- Final model-input transform ---
      # `max_action_dim` (32) and `action_horizon` (16) equal the `action_dim`
      # and `action_horizon` values in the `model.config` section at the top of
      # this file, and the SigLIP processor is loaded from the same checkpoint
      # name as `siglip_model_cfg` there.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment name -> integer ID table (same table is repeated in every
        # embodiment stanza visible in this file). This stanza's own entry is
        # `robocasa_bimanual_panda_parallel_gripper: 15`. ID 26 is not
        # assigned, and `gr1_unified_segmentation: 14` is listed last, out of
        # numeric order.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `robocasa_bimanual_panda_inspire_hand`
    # embodiment: three video streams (two eye-in-hand views and one agent
    # view) plus per-arm end-effector position, end-effector rotation and
    # hand keys for both state and action.
    robocasa_bimanual_panda_inspire_hand:
      _target_: gr00t.data.transform.ComposedModalityTransform
      # Ordered list of transforms; presumably applied top to bottom --
      # confirm against ComposedModalityTransform.
      transforms:
      # --- Video stage: tensor -> crop -> resize -> color jitter -> numpy,
      # each applied to the same three video keys ---
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      # Crop configured with scale 0.95 in `random` mode.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
        scale: 0.95
        mode: random
      # Resize target is 224 x 224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      # --- State stage ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
        # Position and hand keys get min_max; the two quaternion keys are not
        # listed here and instead appear under `target_rotations`.
        normalization_modes:
          state.right_arm_eef_pos: min_max
          state.right_hand: min_max
          state.left_arm_eef_pos: min_max
          state.left_hand: min_max
        # Target rotation representation for the two `*_eef_quat` keys.
        target_rotations:
          state.right_arm_eef_quat: rotation_6d
          state.left_arm_eef_quat: rotation_6d
      # --- Action stage ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
        # Only the two hand keys have a normalization mode (`min_max`). The
        # eef position/rotation action keys are in `apply_to` but have no
        # `normalization_modes` or `target_rotations` entry in this stanza.
        normalization_modes:
          action.right_hand: min_max
          action.left_hand: min_max
      # --- Concatenation: the *_concat_order lists name the keys to be joined
      # per modality, in the order given (right arm before left arm) ---
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
        state_concat_order:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
        action_concat_order:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
      # --- Final model-input transform ---
      # `max_action_dim` (32) and `action_horizon` (16) equal the `action_dim`
      # and `action_horizon` values in the `model.config` section at the top of
      # this file, and the SigLIP processor is loaded from the same checkpoint
      # name as `siglip_model_cfg` there.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment name -> integer ID table (same table is repeated in every
        # embodiment stanza visible in this file). This stanza's own entry is
        # `robocasa_bimanual_panda_inspire_hand: 16`. ID 26 is not assigned,
        # and `gr1_unified_segmentation: 14` is listed last, out of numeric
        # order.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `robocasa_panda_omron` embodiment: three
    # video streams (two side images and one wrist image), state keys for the
    # relative end-effector pose, gripper and base pose, and action keys for
    # the end-effector, gripper, base motion and control mode.
    robocasa_panda_omron:
      _target_: gr00t.data.transform.ComposedModalityTransform
      # Ordered list of transforms; presumably applied top to bottom --
      # confirm against ComposedModalityTransform.
      transforms:
      # --- Video stage: tensor -> crop -> resize -> color jitter -> numpy,
      # each applied to the same three video keys ---
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      # Crop configured with scale 0.95 in `random` mode.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
        scale: 0.95
        mode: random
      # Resize target is 224 x 224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      # --- State stage ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.end_effector_position_relative
        - state.end_effector_rotation_relative
        - state.gripper_qpos
        - state.base_position
        - state.base_rotation
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.end_effector_position_relative
        - state.end_effector_rotation_relative
        - state.gripper_qpos
        - state.base_position
        - state.base_rotation
        # NOTE(review): the two rotation keys are listed in BOTH
        # `normalization_modes` (min_max) and `target_rotations`
        # (rotation_6d). The bimanual panda stanzas in this file list their
        # rotation keys only under `target_rotations` -- confirm that
        # StateActionTransform accepts and intends the combination used here.
        normalization_modes:
          state.end_effector_position_relative: min_max
          state.end_effector_rotation_relative: min_max
          state.gripper_qpos: min_max
          state.base_position: min_max
          state.base_rotation: min_max
        target_rotations:
          state.end_effector_rotation_relative: rotation_6d
          state.base_rotation: rotation_6d
      # --- Action stage ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.end_effector_position
        - action.end_effector_rotation
        - action.gripper_close
        - action.base_motion
        - action.control_mode
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.end_effector_position
        - action.end_effector_rotation
        - action.gripper_close
        - action.base_motion
        - action.control_mode
        # Continuous keys use min_max; `gripper_close` and `control_mode` use
        # `binary`. No `target_rotations` is given for the action rotation key.
        normalization_modes:
          action.end_effector_position: min_max
          action.end_effector_rotation: min_max
          action.gripper_close: binary
          action.base_motion: min_max
          action.control_mode: binary
      # --- Concatenation: the *_concat_order lists name the keys to be joined
      # per modality, in the order given ---
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
        state_concat_order:
        - state.end_effector_position_relative
        - state.end_effector_rotation_relative
        - state.gripper_qpos
        - state.base_position
        - state.base_rotation
        action_concat_order:
        - action.end_effector_position
        - action.end_effector_rotation
        - action.gripper_close
        - action.base_motion
        - action.control_mode
      # --- Final model-input transform ---
      # `max_action_dim` (32) and `action_horizon` (16) equal the `action_dim`
      # and `action_horizon` values in the `model.config` section at the top of
      # this file, and the SigLIP processor is loaded from the same checkpoint
      # name as `siglip_model_cfg` there.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment name -> integer ID table (same table is repeated in every
        # embodiment stanza visible in this file). This stanza's own entry is
        # `robocasa_panda_omron: 13`. ID 26 is not assigned, and
        # `gr1_unified_segmentation: 14` is listed last, out of numeric order.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `gr1_unified` embodiment: one video stream
    # (`video.ego_view`) plus left/right arm, left/right hand and waist keys
    # for both state and action. State keys go through
    # StateActionSinCosTransform; action keys are min_max normalized.
    gr1_unified:
      _target_: gr00t.data.transform.ComposedModalityTransform
      # Ordered list of transforms; presumably applied top to bottom --
      # confirm against ComposedModalityTransform.
      transforms:
      # --- Video stage: tensor -> crop -> resize -> color jitter -> numpy ---
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.ego_view
      # Crop configured with scale 0.95 in `random` mode.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.ego_view
        scale: 0.95
        mode: random
      # Resize target is 224 x 224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.ego_view
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.ego_view
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.ego_view
      # --- State stage ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      # Unlike the other stanzas visible in this file, state keys here use
      # StateActionSinCosTransform and carry no `normalization_modes`.
      # NOTE(review): presumably a sin/cos encoding that changes the state
      # width -- confirm against the transform's implementation.
      - _target_: gr00t.data.transform.StateActionSinCosTransform
        apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      # --- Action stage: to tensor, then min_max normalization on every key ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        normalization_modes:
          action.left_arm: min_max
          action.right_arm: min_max
          action.left_hand: min_max
          action.right_hand: min_max
          action.waist: min_max
      # --- Concatenation: the *_concat_order lists name the keys to be joined
      # per modality, in the order given ---
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.ego_view
        state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      # --- Final model-input transform ---
      # `max_action_dim` (32) and `action_horizon` (16) equal the `action_dim`
      # and `action_horizon` values in the `model.config` section at the top of
      # this file, and the SigLIP processor is loaded from the same checkpoint
      # name as `siglip_model_cfg` there.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment name -> integer ID table (same table is repeated in every
        # embodiment stanza visible in this file). This stanza's own entry is
        # `gr1_unified: 24`. ID 26 is not assigned, and
        # `gr1_unified_segmentation: 14` is listed last, out of numeric order.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `oxe_droid` embodiment: three video streams
    # (two exterior images and one wrist image) plus end-effector position,
    # end-effector rotation and gripper keys for state, and delta
    # position/rotation plus gripper keys for action.
    oxe_droid:
      _target_: gr00t.data.transform.ComposedModalityTransform
      # Ordered list of transforms; presumably applied top to bottom --
      # confirm against ComposedModalityTransform.
      transforms:
      # --- Video stage: tensor -> crop -> resize -> color jitter -> numpy,
      # each applied to the same three video keys ---
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
      # Crop configured with scale 0.95 in `random` mode.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
        scale: 0.95
        mode: random
      # Resize target is 224 x 224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
      # --- State stage ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_position
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_position
        # Position and gripper keys get min_max; the rotation key is not
        # listed here and instead appears under `target_rotations`.
        normalization_modes:
          state.eef_position: min_max
          state.gripper_position: min_max
        target_rotations:
          state.eef_rotation: rotation_6d
      # --- Action stage ---
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.eef_position_delta
        - action.eef_rotation_delta
        - action.gripper_position
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.eef_position_delta
        - action.eef_rotation_delta
        - action.gripper_position
        # Only the gripper key has a normalization mode (`binary`, whereas the
        # state gripper key above uses min_max). `action.eef_position_delta`
        # is in `apply_to` but has no normalization or rotation entry.
        normalization_modes:
          action.gripper_position: binary
        # The action rotation delta targets `axis_angle`, while the state
        # rotation above targets `rotation_6d`.
        target_rotations:
          action.eef_rotation_delta: axis_angle
      # --- Concatenation: the *_concat_order lists name the keys to be joined
      # per modality, in the order given ---
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.exterior_image_1_left_pad_res256_freq15
        - video.exterior_image_2_left_pad_res256_freq15
        - video.wrist_image_left_pad_res256_freq15
        state_concat_order:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_position
        action_concat_order:
        - action.eef_position_delta
        - action.eef_rotation_delta
        - action.gripper_position
      # --- Final model-input transform ---
      # `max_action_dim` (32) and `action_horizon` (16) equal the `action_dim`
      # and `action_horizon` values in the `model.config` section at the top of
      # this file, and the SigLIP processor is loaded from the same checkpoint
      # name as `siglip_model_cfg` there.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment name -> integer ID table (same table is repeated in every
        # embodiment stanza visible in this file). This stanza's own entry is
        # `oxe_droid: 17`. ID 26 is not assigned, and
        # `gr1_unified_segmentation: 14` is listed last, out of numeric order.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `oxe_fractal` entry: one video key, three
    # state keys and three action keys. The list under `transforms` is written
    # in the order video -> state -> action -> concat -> model transform.
    oxe_fractal:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      # Video stage: VideoToTensor -> VideoCrop -> VideoResize ->
      # VideoColorJitter -> VideoToNumpy, all on the same single video key.
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.image_pad_res256_freq03
      # Crop configured with mode `random` and scale 0.95.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.image_pad_res256_freq03
        scale: 0.95
        mode: random
      # Resize to 224x224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.image_pad_res256_freq03
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.image_pad_res256_freq03
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.image_pad_res256_freq03
      # State stage: StateActionToTensor, then StateActionTransform on the same
      # three keys.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closedness_commanded
      # Position and gripper use `min_max`; eef_rotation has no normalization
      # mode and is instead given the target rotation `rotation_6d`.
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closedness_commanded
        normalization_modes:
          state.eef_position: min_max
          state.gripper_closedness_commanded: min_max
        target_rotations:
          state.eef_rotation: rotation_6d
      # Action stage: same two-step pattern as the state stage.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.world_vector
        - action.rotation_delta
        - action.gripper_position
      # Only the gripper gets a normalization mode (`binary`); rotation_delta
      # is given the target rotation `axis_angle`.
      # NOTE(review): action.world_vector is in apply_to but has neither a
      # normalization mode nor a target rotation - presumably left unscaled;
      # confirm against StateActionTransform.
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.world_vector
        - action.rotation_delta
        - action.gripper_position
        normalization_modes:
          action.gripper_position: binary
        target_rotations:
          action.rotation_delta: axis_angle
      # Declares the key order for each modality; the state and action orders
      # repeat the apply_to order used above.
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.image_pad_res256_freq03
        state_concat_order:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closedness_commanded
        action_concat_order:
        - action.world_vector
        - action.rotation_delta
        - action.gripper_position
      # Final model-side transform. max_action_dim (32) and action_horizon (16)
      # equal the values in model.config at the top of this file.
      # NOTE(review): 112 == 6 * 16 + 16; presumably max_sequence_length is
      # derived from the image/token limits plus action_horizon - confirm.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        # Same checkpoint name as siglip_model_cfg in the action head config.
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag name -> integer id. Ids are not contiguous (26 is
        # unused) and `gr1_unified_segmentation: 14` is listed last, out of
        # numeric order. This table contains `oxe_fractal: 18`.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `oxe_language_table` entry: one video key,
    # a single state key and a single action key. No target_rotations are
    # configured anywhere in this pipeline.
    oxe_language_table:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      # Video stage: VideoToTensor -> VideoCrop -> VideoResize ->
      # VideoColorJitter -> VideoToNumpy, all on the same single video key.
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.rgb_pad_res256_freq10
      # Crop configured with mode `random` and scale 0.95.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.rgb_pad_res256_freq10
        scale: 0.95
        mode: random
      # Resize to 224x224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.rgb_pad_res256_freq10
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.rgb_pad_res256_freq10
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.rgb_pad_res256_freq10
      # State stage: the only state key is normalized with `min_max`.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.effector_translation
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.effector_translation
        normalization_modes:
          state.effector_translation: min_max
      # Action stage: the only action key is normalized with `min_max`.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.action
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.action
        normalization_modes:
          action.action: min_max
      # Declares the key order for each modality (one key per modality here).
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.rgb_pad_res256_freq10
        state_concat_order:
        - state.effector_translation
        action_concat_order:
        - action.action
      # Final model-side transform. max_action_dim (32) and action_horizon (16)
      # equal the values in model.config at the top of this file.
      # NOTE(review): 112 == 6 * 16 + 16; presumably max_sequence_length is
      # derived from the image/token limits plus action_horizon - confirm.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        # Same checkpoint name as siglip_model_cfg in the action head config.
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag name -> integer id. Ids are not contiguous (26 is
        # unused) and `gr1_unified_segmentation: 14` is listed last, out of
        # numeric order. This table contains `oxe_language_table: 19`.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `oxe_bridge` entry: three video keys
    # (image_0..image_2), three state keys and three action keys.
    oxe_bridge:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      # Video stage: VideoToTensor -> VideoCrop -> VideoResize ->
      # VideoColorJitter -> VideoToNumpy; every step lists the same three keys.
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.image_0
        - video.image_1
        - video.image_2
      # Crop configured with mode `random` and scale 0.95.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.image_0
        - video.image_1
        - video.image_2
        scale: 0.95
        mode: random
      # Resize to 224x224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.image_0
        - video.image_1
        - video.image_2
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.image_0
        - video.image_1
        - video.image_2
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.image_0
        - video.image_1
        - video.image_2
      # State stage: StateActionToTensor, then StateActionTransform on the same
      # three keys.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closed
      # Position and gripper use `min_max`; eef_rotation has no normalization
      # mode and is instead given the target rotation `rotation_6d`.
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closed
        normalization_modes:
          state.eef_position: min_max
          state.gripper_closed: min_max
        target_rotations:
          state.eef_rotation: rotation_6d
      # Action stage: same two-step pattern as the state stage.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
      # Only the gripper gets a normalization mode (`binary`); eef_rotation is
      # given the target rotation `axis_angle` (the state side uses
      # `rotation_6d`).
      # NOTE(review): action.eef_position is in apply_to but has neither a
      # normalization mode nor a target rotation - presumably left unscaled;
      # confirm against StateActionTransform.
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
        normalization_modes:
          action.gripper_position: binary
        target_rotations:
          action.eef_rotation: axis_angle
      # Declares the key order for each modality; each order repeats the
      # apply_to order used above.
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.image_0
        - video.image_1
        - video.image_2
        state_concat_order:
        - state.eef_position
        - state.eef_rotation
        - state.gripper_closed
        action_concat_order:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
      # Final model-side transform. max_action_dim (32) and action_horizon (16)
      # equal the values in model.config at the top of this file.
      # NOTE(review): 112 == 6 * 16 + 16; presumably max_sequence_length is
      # derived from the image/token limits plus action_horizon - confirm.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        # Same checkpoint name as siglip_model_cfg in the action head config.
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag name -> integer id. Ids are not contiguous (26 is
        # unused) and `gr1_unified_segmentation: 14` is listed last, out of
        # numeric order. This table contains `oxe_bridge: 20`.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `hot3d_hands_only` entry: one video key
    # (video.ego_view) and six state / six action keys covering left and right
    # wrist position, wrist rotation and joint rotation.
    hot3d_hands_only:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      # Video stage: VideoToTensor -> VideoCrop -> VideoResize ->
      # VideoColorJitter -> VideoToNumpy, all on the same single video key.
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.ego_view
      # Crop configured with mode `random` and scale 0.95.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.ego_view
        scale: 0.95
        mode: random
      # Resize to 224x224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.ego_view
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.ego_view
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.ego_view
      # State stage: StateActionToTensor, then StateActionTransform on the same
      # six keys.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.left_wrist_position
        - state.left_wrist_rotation
        - state.left_joint_rotation
        - state.right_wrist_position
        - state.right_wrist_rotation
        - state.right_joint_rotation
      # Wrist positions use `min_max`; wrist rotations are given the target
      # rotation `quaternion`.
      # NOTE(review): the two *_joint_rotation keys are in apply_to but have
      # neither a normalization mode nor a target rotation - presumably left
      # as loaded; confirm against StateActionTransform.
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.left_wrist_position
        - state.left_wrist_rotation
        - state.left_joint_rotation
        - state.right_wrist_position
        - state.right_wrist_rotation
        - state.right_joint_rotation
        normalization_modes:
          state.left_wrist_position: min_max
          state.right_wrist_position: min_max
        target_rotations:
          state.left_wrist_rotation: quaternion
          state.right_wrist_rotation: quaternion
      # Action stage: mirrors the state stage key-for-key with the `action.`
      # prefix.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.left_wrist_position
        - action.left_wrist_rotation
        - action.left_joint_rotation
        - action.right_wrist_position
        - action.right_wrist_rotation
        - action.right_joint_rotation
      # Same settings as the state side: `min_max` on wrist positions,
      # `quaternion` on wrist rotations, nothing listed for joint rotations.
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.left_wrist_position
        - action.left_wrist_rotation
        - action.left_joint_rotation
        - action.right_wrist_position
        - action.right_wrist_rotation
        - action.right_joint_rotation
        normalization_modes:
          action.left_wrist_position: min_max
          action.right_wrist_position: min_max
        target_rotations:
          action.left_wrist_rotation: quaternion
          action.right_wrist_rotation: quaternion
      # Declares the key order for each modality: all left-hand keys first,
      # then all right-hand keys.
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.ego_view
        state_concat_order:
        - state.left_wrist_position
        - state.left_wrist_rotation
        - state.left_joint_rotation
        - state.right_wrist_position
        - state.right_wrist_rotation
        - state.right_joint_rotation
        action_concat_order:
        - action.left_wrist_position
        - action.left_wrist_rotation
        - action.left_joint_rotation
        - action.right_wrist_position
        - action.right_wrist_rotation
        - action.right_joint_rotation
      # Final model-side transform. max_action_dim (32) and action_horizon (16)
      # equal the values in model.config at the top of this file.
      # NOTE(review): 112 == 6 * 16 + 16; presumably max_sequence_length is
      # derived from the image/token limits plus action_horizon - confirm.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        # Same checkpoint name as siglip_model_cfg in the action head config.
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag name -> integer id. Ids are not contiguous (26 is
        # unused) and `gr1_unified_segmentation: 14` is listed last, out of
        # numeric order. This table contains `hot3d_hands_only: 23`.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `agibot` entry: three video keys (top_head,
    # hand_left, hand_right), six state keys and seven action keys. No
    # target_rotations are configured anywhere in this pipeline.
    agibot:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      # Video stage: VideoToTensor -> VideoCrop -> VideoResize ->
      # VideoColorJitter -> VideoToNumpy; every step lists the same three keys.
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.top_head
        - video.hand_left
        - video.hand_right
      # Crop configured with mode `random` and scale 0.95.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.top_head
        - video.hand_left
        - video.hand_right
        scale: 0.95
        mode: random
      # Resize to 224x224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.top_head
        - video.hand_left
        - video.hand_right
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.top_head
        - video.hand_left
        - video.hand_right
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.top_head
        - video.hand_left
        - video.hand_right
      # State stage: StateActionToTensor, then StateActionTransform on the same
      # six keys.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.left_arm_joint_position
        - state.right_arm_joint_position
        - state.left_effector_position
        - state.right_effector_position
        - state.head_position
        - state.waist_position
      # Every state key listed in apply_to is normalized with `min_max`.
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.left_arm_joint_position
        - state.right_arm_joint_position
        - state.left_effector_position
        - state.right_effector_position
        - state.head_position
        - state.waist_position
        normalization_modes:
          state.left_arm_joint_position: min_max
          state.right_arm_joint_position: min_max
          state.left_effector_position: min_max
          state.right_effector_position: min_max
          state.head_position: min_max
          state.waist_position: min_max
      # Action stage: the six state-side names with the `action.` prefix plus
      # action.robot_velocity, which has no state counterpart here.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.left_arm_joint_position
        - action.right_arm_joint_position
        - action.left_effector_position
        - action.right_effector_position
        - action.head_position
        - action.waist_position
        - action.robot_velocity
      # Every action key listed in apply_to is normalized with `min_max`.
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.left_arm_joint_position
        - action.right_arm_joint_position
        - action.left_effector_position
        - action.right_effector_position
        - action.head_position
        - action.waist_position
        - action.robot_velocity
        normalization_modes:
          action.left_arm_joint_position: min_max
          action.right_arm_joint_position: min_max
          action.left_effector_position: min_max
          action.right_effector_position: min_max
          action.head_position: min_max
          action.waist_position: min_max
          action.robot_velocity: min_max
      # Declares the key order for each modality; each order repeats the
      # apply_to order used above.
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.top_head
        - video.hand_left
        - video.hand_right
        state_concat_order:
        - state.left_arm_joint_position
        - state.right_arm_joint_position
        - state.left_effector_position
        - state.right_effector_position
        - state.head_position
        - state.waist_position
        action_concat_order:
        - action.left_arm_joint_position
        - action.right_arm_joint_position
        - action.left_effector_position
        - action.right_effector_position
        - action.head_position
        - action.waist_position
        - action.robot_velocity
      # Final model-side transform. max_action_dim (32) and action_horizon (16)
      # equal the values in model.config at the top of this file.
      # NOTE(review): 112 == 6 * 16 + 16; presumably max_sequence_length is
      # derived from the image/token limits plus action_horizon - confirm.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        # Same checkpoint name as siglip_model_cfg in the action head config.
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag name -> integer id. Ids are not contiguous and
        # `gr1_unified_segmentation: 14` is listed last, out of numeric order.
        # NOTE(review): there is no `agibot` key in this table, and id 26 is
        # the only value in 0..31 that is unassigned - confirm how the tag
        # for this pipeline is resolved to an id.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `oxe_mutex` entry: two video keys (image,
    # wrist_image), two state keys and three action keys.
    oxe_mutex:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      # Video stage: VideoToTensor -> VideoCrop -> VideoResize ->
      # VideoColorJitter -> VideoToNumpy; every step lists the same two keys.
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.image
        - video.wrist_image
      # Crop configured with mode `random` and scale 0.95.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.image
        - video.wrist_image
        scale: 0.95
        mode: random
      # Resize to 224x224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.image
        - video.wrist_image
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.image
        - video.wrist_image
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.image
        - video.wrist_image
      # State stage: both state keys are normalized with `min_max`; no target
      # rotations on the state side.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.joint_angles
        - state.gripper_closed
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.joint_angles
        - state.gripper_closed
        normalization_modes:
          state.joint_angles: min_max
          state.gripper_closed: min_max
      # Action stage: the action keys are end-effector based, while the state
      # keys above are joint based.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
      # Only the gripper gets a normalization mode (`binary`); eef_rotation is
      # given the target rotation `axis_angle`.
      # NOTE(review): action.eef_position is in apply_to but has neither a
      # normalization mode nor a target rotation - presumably left unscaled;
      # confirm against StateActionTransform.
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
        normalization_modes:
          action.gripper_position: binary
        target_rotations:
          action.eef_rotation: axis_angle
      # Declares the key order for each modality; each order repeats the
      # apply_to order used above.
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.image
        - video.wrist_image
        state_concat_order:
        - state.joint_angles
        - state.gripper_closed
        action_concat_order:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
      # Final model-side transform. max_action_dim (32) and action_horizon (16)
      # equal the values in model.config at the top of this file.
      # NOTE(review): 112 == 6 * 16 + 16; presumably max_sequence_length is
      # derived from the image/token limits plus action_horizon - confirm.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        # Same checkpoint name as siglip_model_cfg in the action head config.
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag name -> integer id. Ids are not contiguous (26 is
        # unused) and `gr1_unified_segmentation: 14` is listed last, out of
        # numeric order. This table contains `oxe_mutex: 28`.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `oxe_plex` entry: two video keys (image,
    # wrist_image), a single state key and three action keys.
    oxe_plex:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      # Video stage: VideoToTensor -> VideoCrop -> VideoResize ->
      # VideoColorJitter -> VideoToNumpy; every step lists the same two keys.
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.image
        - video.wrist_image
      # Crop configured with mode `random` and scale 0.95.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.image
        - video.wrist_image
        scale: 0.95
        mode: random
      # Resize to 224x224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.image
        - video.wrist_image
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.image
        - video.wrist_image
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.image
        - video.wrist_image
      # State stage: the only state key is normalized with `min_max`.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.state
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.state
        normalization_modes:
          state.state: min_max
      # Action stage: StateActionToTensor, then StateActionTransform on the
      # same three keys.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
      # Only the gripper gets a normalization mode (`binary`); eef_rotation is
      # given the target rotation `axis_angle`.
      # NOTE(review): action.eef_position is in apply_to but has neither a
      # normalization mode nor a target rotation - presumably left unscaled;
      # confirm against StateActionTransform.
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
        normalization_modes:
          action.gripper_position: binary
        target_rotations:
          action.eef_rotation: axis_angle
      # Declares the key order for each modality; each order repeats the
      # apply_to order used above.
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.image
        - video.wrist_image
        state_concat_order:
        - state.state
        action_concat_order:
        - action.eef_position
        - action.eef_rotation
        - action.gripper_position
      # Final model-side transform. max_action_dim (32) and action_horizon (16)
      # equal the values in model.config at the top of this file.
      # NOTE(review): 112 == 6 * 16 + 16; presumably max_sequence_length is
      # derived from the image/token limits plus action_horizon - confirm.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        # Same checkpoint name as siglip_model_cfg in the action head config.
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag name -> integer id. Ids are not contiguous (26 is
        # unused) and `gr1_unified_segmentation: 14` is listed last, out of
        # numeric order. This table contains `oxe_plex: 30`.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline for the `oxe_roboset` entry: three video keys
    # (image_left, image_right, image_wrist), two state keys and two action
    # keys. No target_rotations are configured anywhere in this pipeline.
    oxe_roboset:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      # Video stage: VideoToTensor -> VideoCrop -> VideoResize ->
      # VideoColorJitter -> VideoToNumpy; every step lists the same three keys.
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.image_left
        - video.image_right
        - video.image_wrist
      # Crop configured with mode `random` and scale 0.95.
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.image_left
        - video.image_right
        - video.image_wrist
        scale: 0.95
        mode: random
      # Resize to 224x224 with `linear` interpolation.
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.image_left
        - video.image_right
        - video.image_wrist
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.image_left
        - video.image_right
        - video.image_wrist
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.image_left
        - video.image_right
        - video.image_wrist
      # State stage: both state keys are normalized with `min_max`.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.joint_position
        - state.gripper_closed
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - state.joint_position
        - state.gripper_closed
        normalization_modes:
          state.joint_position: min_max
          state.gripper_closed: min_max
      # Action stage: joint_position uses `min_max`, gripper_position uses
      # `binary`.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.joint_position
        - action.gripper_position
      - _target_: gr00t.data.transform.StateActionTransform
        apply_to:
        - action.joint_position
        - action.gripper_position
        normalization_modes:
          action.joint_position: min_max
          action.gripper_position: binary
      # Declares the key order for each modality; each order repeats the
      # apply_to order used above.
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.image_left
        - video.image_right
        - video.image_wrist
        state_concat_order:
        - state.joint_position
        - state.gripper_closed
        action_concat_order:
        - action.joint_position
        - action.gripper_position
      # Final model-side transform. max_action_dim (32) and action_horizon (16)
      # equal the values in model.config at the top of this file.
      # NOTE(review): 112 == 6 * 16 + 16; presumably max_sequence_length is
      # derived from the image/token limits plus action_horizon - confirm.
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        # Same checkpoint name as siglip_model_cfg in the action head config.
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag name -> integer id. Ids are not contiguous (26 is
        # unused) and `gr1_unified_segmentation: 14` is listed last, out of
        # numeric order. This table contains `oxe_roboset: 29`.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline stored under the `lapa` key.
    # Video-only: the single key video.ego goes through the Video* steps, and
    # there are no state/action steps in this list.
    lapa:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.ego
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.ego
        scale: 0.95
        mode: random
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.ego
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.ego
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.ego
      # Only video_concat_order is given; no state/action concat order is set.
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.ego
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag -> integer id. Ids are not contiguous: 26 is unused
        # and 14 belongs to gr1_unified_segmentation, which is listed last.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline stored under the `dream` key.
    # One video key (ego_view_bg_crop_pad_res256_freq20) plus five state and
    # five action keys (left/right arm, left/right hand, waist).
    dream:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
      - _target_: gr00t.data.transform.VideoCrop
        apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
        scale: 0.95
        mode: random
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
      # NOTE(review): state and action keys are converted to tensors here but
      # this list has no StateActionTransform / normalization_modes entry for
      # them, unlike e.g. the oxe_roboset pipeline -- confirm this is intended.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.ego_view_bg_crop_pad_res256_freq20
        state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag -> integer id. Ids are not contiguous: 26 is unused
        # and 14 belongs to gr1_unified_segmentation, which is listed last.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
    # Transform pipeline stored under the `gr1_unified_segmentation` key.
    # Differences from the neighbouring pipelines, as visible in this list:
    #   * there is no VideoCrop step between VideoToTensor and VideoResize;
    #   * state keys go through StateActionSinCosTransform instead of a
    #     StateActionTransform with normalization_modes;
    #   * the action keys are segmentation_target and segmentation_target_mask.
    gr1_unified_segmentation:
      _target_: gr00t.data.transform.ComposedModalityTransform
      transforms:
      - _target_: gr00t.data.transform.VideoToTensor
        apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
      - _target_: gr00t.data.transform.VideoResize
        apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
        height: 224
        width: 224
        interpolation: linear
      - _target_: gr00t.data.transform.VideoColorJitter
        apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.08
      - _target_: gr00t.data.transform.VideoToNumpy
        apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      - _target_: gr00t.data.transform.StateActionSinCosTransform
        apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      # Action keys are only converted to tensors; no normalization entry.
      - _target_: gr00t.data.transform.StateActionToTensor
        apply_to:
        - action.segmentation_target
        - action.segmentation_target_mask
      - _target_: gr00t.data.transform.ConcatTransform
        video_concat_order:
        - video.ego_view_bg_crop_pad_res256_freq20
        state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        action_concat_order:
        - action.segmentation_target
        - action.segmentation_target_mask
      - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
        default_instruction: Perform the default behavior.
        num_visual_tokens_per_frame: 16
        max_num_images_per_sequence: 6
        max_action_dim: 32
        max_sequence_length: 112
        action_horizon: 16
        siglip_processor:
          _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
          _convert_: object
          pretrained_model_name_or_path: google/siglip2-large-patch16-256
        # Embodiment tag -> integer id. Ids are not contiguous: 26 is unused
        # and 14 belongs to gr1_unified_segmentation, which is listed last.
        embodiment_tag_mapping:
          real_gr1_arms_only: 0
          real_gr1_arms_only_annotated: 1
          real_gr1_arms_waist: 2
          real_gr1_arms_waist_annotated: 3
          dexmg_gr1_arms_only_inspire: 4
          dexmg_gr1_arms_only_fourier: 5
          dexmg_gr1_arms_waist_fourier: 6
          robocasa_single_arm: 7
          onex_eve_gripper: 8
          robocasa_gr1_arms_only_inspire_hands: 9
          robocasa_gr1_arms_only_fourier_hands: 10
          robocasa_gr1_fixed_lower_body_inspire_hands: 11
          robocasa_gr1_fixed_lower_body_fourier_hands: 12
          robocasa_panda_omron: 13
          robocasa_bimanual_panda_parallel_gripper: 15
          robocasa_bimanual_panda_inspire_hand: 16
          oxe_droid: 17
          oxe_fractal: 18
          oxe_language_table: 19
          oxe_bridge: 20
          real_panda_single_arm: 21
          unknown: 22
          hot3d_hands_only: 23
          gr1_unified: 24
          robocasa_gr1_arms_waist_fourier_hands: 25
          lapa: 27
          oxe_mutex: 28
          oxe_roboset: 29
          oxe_plex: 30
          dream: 31
          gr1_unified_segmentation: 14
  # Metadata version string per dataset/embodiment name. The values are quoted
  # so they stay strings (an unquoted 0217 would not keep its leading zero).
  # `agibot` is listed here but has no id in the embodiment_tag_mapping tables
  # visible in this part of the file.
  metadata_versions:
    robocasa_gr1_arms_only_fourier_hands: '0217'
    robocasa_gr1_fixed_lower_body_fourier_hands: '0217'
    robocasa_bimanual_panda_parallel_gripper: '0217'
    robocasa_bimanual_panda_inspire_hand: '0217'
    robocasa_panda_omron: '0217'
    gr1_unified: '0304'
    oxe_droid: '0221'
    oxe_fractal: '0221'
    oxe_language_table: '0221'
    oxe_bridge: '0221'
    robocasa_gr1_arms_waist_fourier_hands: '0225'
    hot3d_hands_only: '0220'
    agibot: '0306'
    oxe_mutex: '0303'
    oxe_plex: '0303'
    oxe_roboset: '0303'
    lapa: '0305'
    dream: '0308'
    gr1_unified_segmentation: '0309'
  # Dataset keyword arguments (the consumer is not visible in this file).
  # use_global_metadata carries the same value as the top-level key of that name.
  dataset_kwargs:
    video_backend: decord
    use_global_metadata: true
  # Dataset-mixture keyword arguments (the consumer is not visible in this file).
  # Note: this seed (42) differs from the top-level / training_args seed (21).
  mixture_kwargs:
    training: true
    balance_dataset_weights: true
    seed: 42
    shard_sampling_rate: 0.1
# Trainer spec. `_target_` / `_partial_` / `_recursive_` look like Hydra
# instantiate keys; with `_partial_: true` the `???` fields (model,
# train_dataset, compute_dtype) are presumably supplied by the caller at
# runtime -- TODO confirm against the launch script.
trainer:
  _target_: gr00t.experiment.dual_brain.experiment.DualBrainTrainer
  _partial_: true
  _recursive_: false
  callbacks: null
  model: ???
  train_dataset: ???
  compute_dtype: ???
  benchmark_time: false
  enable_profiling: false
  profiling_steps: 5
# --- Run identity and output location ---
# output_dir is an absolute, user-specific path; change it before reusing
# this config on another machine.
wandb_project: dream_idm
output_dir: /mnt/amlfs-01/home/seonghyeony/checkpoints/gr00t_s_gr1_idm_real_global_stats
load_from_yaml: null
# `???` marks a value that is not set in this file.
gear_credentials: ???
# --- Checkpoint upload ---
upload_checkpoints: false
upload_every: 10000
upload_last_n_checkpoints: 5
remove_unused_columns: false
# --- Precision ---
bf16: true
tf32: true
# --- Batch size and data loading ---
global_batch_size: null
raise_error_if_global_batch_size_not_set: false
per_device_train_batch_size: 64
per_device_eval_batch_size: 64
gradient_accumulation_steps: 1
dataloader_num_workers: 6
dataloader_pin_memory: false
dataloader_persistent_workers: true
# --- Optimizer and schedule ---
optim: adamw_torch
learning_rate: 0.0001
adam_beta1: 0.95
adam_beta2: 0.999
adam_epsilon: 1.0e-08
weight_decay: 1.0e-05
lr_scheduler_type: cosine
warmup_ratio: 0.05
# NOTE(review): written as a float (10.0) -- confirm the consumer accepts an
# integral float for a step count.
logging_steps: 10.0
# Both an epoch count and a step count are set -- NOTE(review): confirm which
# one bounds the run.
num_train_epochs: 1000
max_steps: 60000
# --- Checkpointing and evaluation ---
save_strategy: steps
save_steps: 1000
# Quoted so the value stays the string 'no' instead of a YAML 1.1 boolean.
eval_strategy: 'no'
save_total_limit: 30
report_to: wandb
seed: 21
do_eval: false
gradient_checkpointing: false
# --- Distributed settings ---
ddp_find_unused_parameters: false
ddp_bucket_cap_mb: 100
ray_num_workers: ???
eval_bf16: true
torch_compile_mode: null
pretrained_model_path: null
only_tune_projectors: false
# Arguments for transformers.TrainingArguments. Every value below that has a
# top-level key of the same name in this file carries the same value as that
# key; run_name and deepspeed appear only here.
training_args:
  _target_: transformers.TrainingArguments
  output_dir: /mnt/amlfs-01/home/seonghyeony/checkpoints/gr00t_s_gr1_idm_real_global_stats
  run_name: gr00t_s_gr1_idm_real_global_stats
  remove_unused_columns: false
  # Relative path -- presumably resolved against the working directory at
  # launch; verify.
  deepspeed: gr00t/experiment/dual_brain/configs/deepspeed/zero2.json
  gradient_checkpointing: false
  bf16: true
  tf32: true
  per_device_train_batch_size: 64
  per_device_eval_batch_size: 64
  gradient_accumulation_steps: 1
  dataloader_num_workers: 6
  dataloader_pin_memory: false
  dataloader_persistent_workers: true
  optim: adamw_torch
  adam_beta1: 0.95
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  learning_rate: 0.0001
  weight_decay: 1.0e-05
  warmup_ratio: 0.05
  lr_scheduler_type: cosine
  # NOTE(review): float 10.0 for a step count -- confirm the pinned
  # transformers version accepts an integral float here.
  logging_steps: 10.0
  num_train_epochs: 1000
  max_steps: 60000
  save_strategy: steps
  save_steps: 1000
  save_total_limit: 30
  report_to: wandb
  seed: 21
  do_eval: false
  ddp_find_unused_parameters: false
  ddp_bucket_cap_mb: 100
  torch_compile_mode: null
# Top-level action-head knobs. The same values appear inside `action_head_cfg`
# below (attn_dropout 0.2 matches the `dropout` entries there).
# `add_seperator_token` is spelled this way throughout the file; keep the
# spelling, since the key name is what consumers look up.
add_seperator_token: true
add_pos_embed: true
hidden_size: 1024
attn_dropout: 0.2
siglip_hidden_size: 1024
siglip_version: google/siglip2-large-patch16-256
# Action head spec. The visible start of `model.config.action_head_cfg` at the
# top of this file carries the same keys and values as this block.
action_head_cfg:
  _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM
  _convert_: object
  config:
    _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig
    _recursive_: false
    add_seperator_token: true
    add_pos_embed: true
    model_dtype: float32
    mm_vision_select_layer: -2
    max_state_dim: 44
    max_action_dim: 32
    hidden_size: 1024
    tune_vision_tower: true
    add_view_embed: true
    max_num_views: 3
    # Vision encoder; same checkpoint name as the siglip_processor entries used
    # by the GR00TIDMTransform blocks in this file.
    siglip_model_cfg:
      _target_: gr00t.model.action_head.siglip.SiglipModel.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    siglip_hidden_size: 1024
    # 16 heads * 64 dims = 1024, the same number as hidden_size above --
    # presumably these must stay in sync; verify before changing one of them.
    vl_self_attention_cfg:
      _target_: gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer
      positional_embeddings: null
      num_layers: 4
      num_attention_heads: 16
      attention_head_dim: 64
      dropout: 0.2
      final_dropout: true
    diffusion_model_cfg:
      _target_: gr00t.model.action_head.cross_attention_dit.DiT
      positional_embeddings: null
      num_layers: 8
      num_attention_heads: 16
      attention_head_dim: 64
      norm_type: ada_norm
      dropout: 0.2
      final_dropout: true
      output_dim: 1024
      interleave_self_attention: true
    mm_projector_cfg:
      _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjector
      _convert_: object
      config:
        _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig
        hidden_size: 1024
        mm_hidden_size: 1024
        mm_projector_type: mlp_doubledownsample
    # action_dim equals max_action_dim (32); action_horizon equals the
    # top-level action_horizon (16).
    action_dim: 32
    action_horizon: 16
    num_inference_timesteps: 16
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    backbone_features_projector_cfg: null
# Backbone spec: an IdentityBackbone target with a hidden size of 0, the same
# values as `model.config.backbone_cfg` / `hidden_size: 0` at the top of the file.
backbone_hidden_size: 0
backbone_cfg:
  _target_: gr00t.model.backbone.IdentityBackbone
# Embodiment tag -> integer index. The per-transform `embodiment_tag_mapping`
# tables in this file hold the same entries. Indices run from 0 to 31 but are
# not contiguous: 26 is unused, and 14 belongs to gr1_unified_segmentation,
# which is listed last. Indices must be unique when adding a tag.
embodiment_tag_to_projector_index:
  real_gr1_arms_only: 0
  real_gr1_arms_only_annotated: 1
  real_gr1_arms_waist: 2
  real_gr1_arms_waist_annotated: 3
  dexmg_gr1_arms_only_inspire: 4
  dexmg_gr1_arms_only_fourier: 5
  dexmg_gr1_arms_waist_fourier: 6
  robocasa_single_arm: 7
  onex_eve_gripper: 8
  robocasa_gr1_arms_only_inspire_hands: 9
  robocasa_gr1_arms_only_fourier_hands: 10
  robocasa_gr1_fixed_lower_body_inspire_hands: 11
  robocasa_gr1_fixed_lower_body_fourier_hands: 12
  robocasa_panda_omron: 13
  robocasa_bimanual_panda_parallel_gripper: 15
  robocasa_bimanual_panda_inspire_hand: 16
  oxe_droid: 17
  oxe_fractal: 18
  oxe_language_table: 19
  oxe_bridge: 20
  real_panda_single_arm: 21
  unknown: 22
  hot3d_hands_only: 23
  gr1_unified: 24
  robocasa_gr1_arms_waist_fourier_hands: 25
  lapa: 27
  oxe_mutex: 28
  oxe_roboset: 29
  oxe_plex: 30
  dream: 31
  gr1_unified_segmentation: 14
# Scalar settings that reappear with the same values in `model_specific_transform`
# (num_visual_tokens_per_frame, max_action_dim, max_sequence_length) and in the
# resize settings (model_image_resolution 224 = height/width 224).
num_visual_tokens_per_frame: 16
max_action_dim: 32
language_dropout_prob: 0.0
model_image_resolution: 224
max_sequence_length: 112
# Standalone GR00TIDMTransform spec. The last entry of each per-embodiment
# transform list in this file repeats these same settings.
model_specific_transform:
  _target_: gr00t.model.transforms_idm.GR00TIDMTransform
  default_instruction: Perform the default behavior.
  num_visual_tokens_per_frame: 16
  max_num_images_per_sequence: 6
  max_action_dim: 32
  max_sequence_length: 112
  action_horizon: 16
  siglip_processor:
    _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
    _convert_: object
    pretrained_model_name_or_path: google/siglip2-large-patch16-256
  # Same entries as the top-level `embodiment_tag_to_projector_index`.
  # Ids are not contiguous: 26 is unused and 14 is listed last.
  embodiment_tag_mapping:
    real_gr1_arms_only: 0
    real_gr1_arms_only_annotated: 1
    real_gr1_arms_waist: 2
    real_gr1_arms_waist_annotated: 3
    dexmg_gr1_arms_only_inspire: 4
    dexmg_gr1_arms_only_fourier: 5
    dexmg_gr1_arms_waist_fourier: 6
    robocasa_single_arm: 7
    onex_eve_gripper: 8
    robocasa_gr1_arms_only_inspire_hands: 9
    robocasa_gr1_arms_only_fourier_hands: 10
    robocasa_gr1_fixed_lower_body_inspire_hands: 11
    robocasa_gr1_fixed_lower_body_fourier_hands: 12
    robocasa_panda_omron: 13
    robocasa_bimanual_panda_parallel_gripper: 15
    robocasa_bimanual_panda_inspire_hand: 16
    oxe_droid: 17
    oxe_fractal: 18
    oxe_language_table: 19
    oxe_bridge: 20
    real_panda_single_arm: 21
    unknown: 22
    hot3d_hands_only: 23
    gr1_unified: 24
    robocasa_gr1_arms_waist_fourier_hands: 25
    lapa: 27
    oxe_mutex: 28
    oxe_roboset: 29
    oxe_plex: 30
    dream: 31
    gr1_unified_segmentation: 14
# Collator target plus horizon/resolution scalars.
# action_horizon 16 equals the number of action delta_indices (0..15) in the
# modality_config_* blocks of this file; state_horizon 1 equals their single
# state delta index; image_resolution 224 equals the resize height/width.
data_collator:
  _target_: gr00t.model.transforms_idm.DefaultDataCollatorGR00TIDM
use_global_metadata: true
action_horizon: 16
state_horizon: 1
image_resolution: 224
# Video transform templates. `apply_to` is left as `???` (unset) in each one;
# the per-embodiment transform lists in this file use the same targets and
# numeric settings with concrete `apply_to` key lists.
totensor_cfg:
  _target_: gr00t.data.transform.VideoToTensor
  apply_to: ???
crop_cfg:
  _target_: gr00t.data.transform.VideoCrop
  apply_to: ???
  scale: 0.95
  mode: random
resize_cfg:
  _target_: gr00t.data.transform.VideoResize
  apply_to: ???
  height: 224
  width: 224
  interpolation: linear
color_jitter_cfg:
  _target_: gr00t.data.transform.VideoColorJitter
  apply_to: ???
  brightness: 0.3
  contrast: 0.4
  saturation: 0.5
  hue: 0.08
# The grayscale and posterize targets do not appear in any of the transform
# lists visible in this part of the file.
random_grayscale_cfg:
  _target_: gr00t.data.transform.VideoRandomGrayscale
  apply_to: ???
  p: 0.1
random_posterize_cfg:
  _target_: gr00t.data.transform.VideoRandomPosterize
  apply_to: ???
  bits: 4
  p: 0.1
to_numpy_cfg:
  _target_: gr00t.data.transform.VideoToNumpy
  apply_to: ???
# Modality spec for robocasa_gr1_arms_only_fourier_hands: per modality, the
# delta indices and the keys to read. Numeric index lists are written in flow
# style; key lists stay in block style.
modality_config_robocasa_gr1_arms_only_fourier_hands:
  # Two delta indices for video: 0 and 16.
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
    - video.ego_view_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
  # Sixteen consecutive delta indices (0..15).
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - dream_actions
# Transform pipeline for robocasa_gr1_arms_only_fourier_hands.
# One video key, four state keys and four action keys (left/right arm,
# left/right hand); every state and action key uses min_max.
transform_robocasa_gr1_arms_only_fourier_hands:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # --- video steps ---
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to:
    - video.ego_view_pad_res256_freq20
  - _target_: gr00t.data.transform.VideoCrop
    apply_to:
    - video.ego_view_pad_res256_freq20
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to:
    - video.ego_view_pad_res256_freq20
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to:
    - video.ego_view_pad_res256_freq20
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to:
    - video.ego_view_pad_res256_freq20
  # --- state steps ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
    normalization_modes:
      state.left_arm: min_max
      state.right_arm: min_max
      state.left_hand: min_max
      state.right_hand: min_max
  # --- action steps ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
    # Keys are listed right-before-left here; this is a mapping, so the
    # listing order differs from apply_to without changing the key set.
    normalization_modes:
      action.right_arm: min_max
      action.left_arm: min_max
      action.right_hand: min_max
      action.left_hand: min_max
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order:
    - video.ego_view_pad_res256_freq20
    state_concat_order:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
    action_concat_order:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: Perform the default behavior.
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment tag -> integer id. Ids are not contiguous: 26 is unused and
    # 14 belongs to gr1_unified_segmentation, which is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Modality spec for robocasa_gr1_arms_waist_fourier_hands: per modality, the
# delta indices and the keys to read. State and action each list five keys
# (arms, hands, waist). Numeric index lists are written in flow style; key
# lists stay in block style.
modality_config_robocasa_gr1_arms_waist_fourier_hands:
  # Two delta indices for video: 0 and 16.
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
    - video.ego_view_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
    - state.waist
  # Sixteen consecutive delta indices (0..15).
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
    - action.waist
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - dream_actions
# Transform pipeline for robocasa_gr1_arms_waist_fourier_hands.
# One video key, five state keys and five action keys (left/right arm,
# left/right hand, waist); every state and action key uses min_max.
transform_robocasa_gr1_arms_waist_fourier_hands:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # --- video steps ---
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to:
    - video.ego_view_pad_res256_freq20
  - _target_: gr00t.data.transform.VideoCrop
    apply_to:
    - video.ego_view_pad_res256_freq20
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to:
    - video.ego_view_pad_res256_freq20
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to:
    - video.ego_view_pad_res256_freq20
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to:
    - video.ego_view_pad_res256_freq20
  # --- state steps ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
    - state.waist
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
    - state.waist
    normalization_modes:
      state.left_arm: min_max
      state.right_arm: min_max
      state.left_hand: min_max
      state.right_hand: min_max
      state.waist: min_max
  # --- action steps ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
    - action.waist
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
    - action.waist
    # Keys are listed right-before-left here; this is a mapping, so the
    # listing order differs from apply_to without changing the key set.
    normalization_modes:
      action.right_arm: min_max
      action.left_arm: min_max
      action.right_hand: min_max
      action.left_hand: min_max
      action.waist: min_max
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order:
    - video.ego_view_pad_res256_freq20
    state_concat_order:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
    - state.waist
    action_concat_order:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
    - action.waist
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: Perform the default behavior.
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment tag -> integer id. Ids are not contiguous: 26 is unused and
    # 14 belongs to gr1_unified_segmentation, which is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Modality spec for robocasa_panda_omron: per modality, the delta indices and
# the keys to read. Three video keys (two side views, one wrist view), five
# state keys and five action keys. Numeric index lists are written in flow
# style; key lists stay in block style.
modality_config_robocasa_panda_omron:
  # Two delta indices for video: 0 and 16.
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
    - video.res256_image_side_0
    - video.res256_image_side_1
    - video.res256_image_wrist_0
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - state.end_effector_position_relative
    - state.end_effector_rotation_relative
    - state.gripper_qpos
    - state.base_position
    - state.base_rotation
  # Sixteen consecutive delta indices (0..15).
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.end_effector_position
    - action.end_effector_rotation
    - action.gripper_close
    - action.base_motion
    - action.control_mode
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - dream_actions
# Transform stack for the robocasa_panda_omron embodiment: video stages over
# three camera keys, then state and action tensor/normalisation stages, a
# concat stage, and finally the IDM transform.
# NOTE(review): stages are presumably applied in list order; confirm in
# ComposedModalityTransform.
transform_robocasa_panda_omron:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # Video stages: each one targets the same three camera keys.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
    # State stages: all five keys are min_max normalised; the two rotation
    # keys additionally carry a rotation_6d target.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.end_effector_position_relative
        - state.end_effector_rotation_relative
        - state.gripper_qpos
        - state.base_position
        - state.base_rotation
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.end_effector_position_relative
        - state.end_effector_rotation_relative
        - state.gripper_qpos
        - state.base_position
        - state.base_rotation
      normalization_modes:
        state.end_effector_position_relative: min_max
        state.end_effector_rotation_relative: min_max
        state.gripper_qpos: min_max
        state.base_position: min_max
        state.base_rotation: min_max
      target_rotations:
        state.end_effector_rotation_relative: rotation_6d
        state.base_rotation: rotation_6d
    # Action stages: gripper_close and control_mode use the binary mode, the
    # remaining three keys use min_max.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.end_effector_position
        - action.end_effector_rotation
        - action.gripper_close
        - action.base_motion
        - action.control_mode
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.end_effector_position
        - action.end_effector_rotation
        - action.gripper_close
        - action.base_motion
        - action.control_mode
      normalization_modes:
        action.end_effector_position: min_max
        action.end_effector_rotation: min_max
        action.gripper_close: binary
        action.base_motion: min_max
        action.control_mode: binary
    # Concat stage: fixes the per-modality key order.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      state_concat_order:
        - state.end_effector_position_relative
        - state.end_effector_rotation_relative
        - state.gripper_qpos
        - state.base_position
        - state.base_rotation
      action_concat_order:
        - action.end_effector_position
        - action.end_effector_rotation
        - action.gripper_close
        - action.base_motion
        - action.control_mode
    # IDM stage. max_action_dim and action_horizon repeat the values in the
    # model config at the top of the file, and the SigLIP processor is loaded
    # from the same checkpoint name as siglip_model_cfg.
    # NOTE(review): 112 == 6 * 16 + 16; presumably images * tokens-per-frame
    # plus action_horizon — confirm against GR00TIDMTransform.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id. Id 26 does not appear in the table and
      # id 14 (gr1_unified_segmentation) is listed last.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
# Modality spec for robocasa_gr1_fixed_lower_body_fourier_hands. Each entry is
# a ModalityConfig pairing a delta_indices list with the keys to read.
# NOTE(review): delta_indices are presumably offsets relative to the sampled
# step — confirm in gr00t.data.dataset.ModalityConfig.
modality_config_robocasa_gr1_fixed_lower_body_fourier_hands:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.agentview_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
      - state.neck
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, the same count as action_horizon: 16 in the
    # model config at the top of the file.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
      - action.neck
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - dream_actions
# Transform stack for robocasa_gr1_fixed_lower_body_fourier_hands: one camera
# key, six state keys and six action keys, all min_max normalised.
# NOTE(review): stages are presumably applied in list order; confirm in
# ComposedModalityTransform.
transform_robocasa_gr1_fixed_lower_body_fourier_hands:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # Video stages on the single agent-view key.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.agentview_pad_res256_freq20
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.agentview_pad_res256_freq20
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.agentview_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.agentview_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.agentview_pad_res256_freq20
    # State stages.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
      normalization_modes:
        state.left_arm: min_max
        state.right_arm: min_max
        state.left_hand: min_max
        state.right_hand: min_max
        state.waist: min_max
        state.neck: min_max
    # Action stages. normalization_modes lists right_* before left_*, unlike
    # apply_to; it is a mapping, so only the key/value pairs matter.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
      normalization_modes:
        action.right_arm: min_max
        action.left_arm: min_max
        action.right_hand: min_max
        action.left_hand: min_max
        action.waist: min_max
        action.neck: min_max
    # Concat stage: fixes the per-modality key order.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.agentview_pad_res256_freq20
      state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
      action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
    # IDM stage. max_action_dim and action_horizon repeat the values in the
    # model config at the top of the file, and the SigLIP processor is loaded
    # from the same checkpoint name as siglip_model_cfg.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id. Id 26 does not appear in the table and
      # id 14 (gr1_unified_segmentation) is listed last.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
# Modality spec for robocasa_bimanual_panda_parallel_gripper: three camera
# keys plus per-arm end-effector and gripper keys.
# NOTE(review): delta_indices are presumably offsets relative to the sampled
# step — confirm in gr00t.data.dataset.ModalityConfig.
modality_config_robocasa_bimanual_panda_parallel_gripper:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.robot0_eye_in_hand_pad_res256_freq20
      - video.robot1_eye_in_hand_pad_res256_freq20
      - video.agentview_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.right_arm_eef_pos
      - state.right_arm_eef_quat
      - state.right_gripper_qpos
      - state.left_arm_eef_pos
      - state.left_arm_eef_quat
      - state.left_gripper_qpos
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, the same count as action_horizon: 16 in the
    # model config at the top of the file.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.right_arm_eef_pos
      - action.right_arm_eef_rot
      - action.right_gripper_close
      - action.left_arm_eef_pos
      - action.left_arm_eef_rot
      - action.left_gripper_close
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - dream_actions
# Transform stack for robocasa_bimanual_panda_parallel_gripper: three camera
# keys, six state keys and six action keys.
# NOTE(review): stages are presumably applied in list order; confirm in
# ComposedModalityTransform.
transform_robocasa_bimanual_panda_parallel_gripper:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # Video stages: each one targets the same three camera keys.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    # State stages: position and gripper keys are min_max normalised; the two
    # quaternion keys have no normalisation entry and instead carry a
    # rotation_6d target.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
      normalization_modes:
        state.right_arm_eef_pos: min_max
        state.right_gripper_qpos: min_max
        state.left_arm_eef_pos: min_max
        state.left_gripper_qpos: min_max
      target_rotations:
        state.right_arm_eef_quat: rotation_6d
        state.left_arm_eef_quat: rotation_6d
    # Action stages: only the two gripper keys have a normalisation entry
    # (binary); the four eef pos/rot keys are listed in apply_to without one.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
      normalization_modes:
        action.right_gripper_close: binary
        action.left_gripper_close: binary
    # Concat stage: fixes the per-modality key order.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      state_concat_order:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
      action_concat_order:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
    # IDM stage. max_action_dim and action_horizon repeat the values in the
    # model config at the top of the file, and the SigLIP processor is loaded
    # from the same checkpoint name as siglip_model_cfg.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id. Id 26 does not appear in the table and
      # id 14 (gr1_unified_segmentation) is listed last.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
# Modality spec for robocasa_bimanual_panda_inspire_hand: three camera keys
# plus per-arm end-effector and hand keys.
# NOTE(review): delta_indices are presumably offsets relative to the sampled
# step — confirm in gr00t.data.dataset.ModalityConfig.
modality_config_robocasa_bimanual_panda_inspire_hand:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.robot0_eye_in_hand_pad_res256_freq20
      - video.robot1_eye_in_hand_pad_res256_freq20
      - video.agentview_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.right_arm_eef_pos
      - state.right_arm_eef_quat
      - state.right_hand
      - state.left_arm_eef_pos
      - state.left_arm_eef_quat
      - state.left_hand
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, the same count as action_horizon: 16 in the
    # model config at the top of the file.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.right_arm_eef_pos
      - action.right_arm_eef_rot
      - action.right_hand
      - action.left_arm_eef_pos
      - action.left_arm_eef_rot
      - action.left_hand
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.action.task_description
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - dream_actions
# Transform stack for robocasa_bimanual_panda_inspire_hand: three camera keys,
# six state keys and six action keys.
# NOTE(review): stages are presumably applied in list order; confirm in
# ComposedModalityTransform.
transform_robocasa_bimanual_panda_inspire_hand:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # Video stages: each one targets the same three camera keys.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    # State stages: position and hand keys are min_max normalised; the two
    # quaternion keys have no normalisation entry and instead carry a
    # rotation_6d target.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
      normalization_modes:
        state.right_arm_eef_pos: min_max
        state.right_hand: min_max
        state.left_arm_eef_pos: min_max
        state.left_hand: min_max
      target_rotations:
        state.right_arm_eef_quat: rotation_6d
        state.left_arm_eef_quat: rotation_6d
    # Action stages: only the two hand keys have a normalisation entry
    # (min_max); the four eef pos/rot keys are listed in apply_to without one.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
      normalization_modes:
        action.right_hand: min_max
        action.left_hand: min_max
    # Concat stage: fixes the per-modality key order.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      state_concat_order:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
      action_concat_order:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
    # IDM stage. max_action_dim and action_horizon repeat the values in the
    # model config at the top of the file, and the SigLIP processor is loaded
    # from the same checkpoint name as siglip_model_cfg.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id. Id 26 does not appear in the table and
      # id 14 (gr1_unified_segmentation) is listed last.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
# Modality spec for gr1_unified_segmentation: one ego-view camera key, five
# state keys, and two segmentation-target action keys.
# NOTE(review): delta_indices are presumably offsets relative to the sampled
# step — confirm in gr00t.data.dataset.ModalityConfig.
modality_config_gr1_unified_segmentation:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.ego_view_bg_crop_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, the same count as action_horizon: 16 in the
    # model config at the top of the file.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.segmentation_target
      - action.segmentation_target_mask
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.coarse_action
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - dream_actions
# Transform stack for gr1_unified_segmentation. Compared with the other
# stacks in this file it has no VideoCrop stage and no StateActionTransform
# stage for actions; state goes through StateActionSinCosTransform.
# NOTE(review): stages are presumably applied in list order; confirm in
# ComposedModalityTransform.
transform_gr1_unified_segmentation:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
    # Video stages on the single ego-view key.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.ego_view_bg_crop_pad_res256_freq20
    # State stages.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
    - _target_: gr00t.data.transform.StateActionSinCosTransform
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
    # Action stage: tensor conversion only.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.segmentation_target
        - action.segmentation_target_mask
    # Concat stage: fixes the per-modality key order.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.ego_view_bg_crop_pad_res256_freq20
      state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      action_concat_order:
        - action.segmentation_target
        - action.segmentation_target_mask
    # IDM stage. max_action_dim and action_horizon repeat the values in the
    # model config at the top of the file, and the SigLIP processor is loaded
    # from the same checkpoint name as siglip_model_cfg.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id. Id 26 does not appear in the table and
      # id 14 (gr1_unified_segmentation) is listed last.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
# Modality spec for gr1_unified: one ego-view camera key and five
# arm/hand/waist keys for both state and action.
# NOTE(review): delta_indices are presumably offsets relative to the sampled
# step — confirm in gr00t.data.dataset.ModalityConfig.
modality_config_gr1_unified:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.ego_view_bg_crop_pad_res256_freq20
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, the same count as action_horizon: 16 in the
    # model config at the top of the file.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.human.coarse_action
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - dream_actions
# Transform stack for the gr1_unified embodiment: one ego-view camera key,
# five state keys (sin/cos encoded) and five action keys (min_max normalised).
# The video key used by every stage below must be the key that
# modality_config_gr1_unified loads, i.e.
# video.ego_view_bg_crop_pad_res256_freq20.
# NOTE(review): stages are presumably applied in list order; confirm in
# ComposedModalityTransform.
transform_gr1_unified:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # Video stages on the single ego-view key.
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to:
    - video.ego_view_bg_crop_pad_res256_freq20
  - _target_: gr00t.data.transform.VideoCrop
    apply_to:
    - video.ego_view_bg_crop_pad_res256_freq20
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to:
    - video.ego_view_bg_crop_pad_res256_freq20
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to:
    - video.ego_view_bg_crop_pad_res256_freq20
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to:
    - video.ego_view_bg_crop_pad_res256_freq20
  # State stages: tensor conversion followed by StateActionSinCosTransform.
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
    - state.waist
  - _target_: gr00t.data.transform.StateActionSinCosTransform
    apply_to:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
    - state.waist
  # Action stages: every action key is min_max normalised.
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
    - action.waist
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
    - action.waist
    normalization_modes:
      action.left_arm: min_max
      action.right_arm: min_max
      action.left_hand: min_max
      action.right_hand: min_max
      action.waist: min_max
  # Concat stage: fixes the per-modality key order.
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order:
    - video.ego_view_bg_crop_pad_res256_freq20
    state_concat_order:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
    - state.waist
    action_concat_order:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
    - action.waist
  # IDM stage. max_action_dim and action_horizon repeat the values in the
  # model config at the top of the file, and the SigLIP processor is loaded
  # from the same checkpoint name as siglip_model_cfg.
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: Perform the default behavior.
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment name -> integer id. Id 26 does not appear in the table and
    # id 14 (gr1_unified_segmentation) is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Modality spec for oxe_droid: three camera keys, three end-effector/gripper
# state keys, and three delta/gripper action keys.
# NOTE(review): delta_indices are presumably offsets relative to the sampled
# step — confirm in gr00t.data.dataset.ModalityConfig.
modality_config_oxe_droid:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
      - video.exterior_image_1_left_pad_res256_freq15
      - video.exterior_image_2_left_pad_res256_freq15
      - video.wrist_image_left_pad_res256_freq15
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_position
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, the same count as action_horizon: 16 in the
    # model config at the top of the file.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
      - action.eef_position_delta
      - action.eef_rotation_delta
      - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - annotation.language.language_instruction
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
      - dream_actions
# Transform pipeline for the oxe_droid embodiment. Stages are listed as:
# video preprocessing/augmentation, state and action conversion,
# concatenation, and finally the GR00T IDM model transform.
transform_oxe_droid:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # --- video stages: the same three camera keys are used throughout ---
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to:
    - video.exterior_image_1_left_pad_res256_freq15
    - video.exterior_image_2_left_pad_res256_freq15
    - video.wrist_image_left_pad_res256_freq15
  - _target_: gr00t.data.transform.VideoCrop
    apply_to:
    - video.exterior_image_1_left_pad_res256_freq15
    - video.exterior_image_2_left_pad_res256_freq15
    - video.wrist_image_left_pad_res256_freq15
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to:
    - video.exterior_image_1_left_pad_res256_freq15
    - video.exterior_image_2_left_pad_res256_freq15
    - video.wrist_image_left_pad_res256_freq15
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to:
    - video.exterior_image_1_left_pad_res256_freq15
    - video.exterior_image_2_left_pad_res256_freq15
    - video.wrist_image_left_pad_res256_freq15
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to:
    - video.exterior_image_1_left_pad_res256_freq15
    - video.exterior_image_2_left_pad_res256_freq15
    - video.wrist_image_left_pad_res256_freq15
  # --- state stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [state.eef_position, state.eef_rotation, state.gripper_position]
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to: [state.eef_position, state.eef_rotation, state.gripper_position]
    normalization_modes:
      state.eef_position: min_max
      state.gripper_position: min_max
    target_rotations: {state.eef_rotation: rotation_6d}
  # --- action stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - action.eef_position_delta
    - action.eef_rotation_delta
    - action.gripper_position
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - action.eef_position_delta
    - action.eef_rotation_delta
    - action.gripper_position
    # Only the gripper key has a normalization mode configured here.
    normalization_modes: {action.gripper_position: binary}
    target_rotations: {action.eef_rotation_delta: axis_angle}
  # --- concatenation order per modality ---
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order:
    - video.exterior_image_1_left_pad_res256_freq15
    - video.exterior_image_2_left_pad_res256_freq15
    - video.wrist_image_left_pad_res256_freq15
    state_concat_order:
    - state.eef_position
    - state.eef_rotation
    - state.gripper_position
    action_concat_order:
    - action.eef_position_delta
    - action.eef_rotation_delta
    - action.gripper_position
  # --- model-specific transform ---
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: 'Perform the default behavior.'
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment tag name -> integer index. Indices are not contiguous:
    # 26 is unassigned and 14 is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Modality spec for the oxe_fractal embodiment: for each modality, the dataset
# keys to load and the delta_indices to load them at.
# (delta_indices are presumably offsets relative to the sampled step -
# confirm against gr00t.data.dataset.ModalityConfig.)
modality_config_oxe_fractal:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys: [video.image_pad_res256_freq03]
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - state.eef_position
    - state.eef_rotation
    - state.gripper_closedness_commanded
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, 0 through 15.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.world_vector
    - action.rotation_delta
    - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [annotation.language.natural_language_instruction]
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [lapa_action]
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [dream_actions]
# Transform pipeline for the oxe_fractal embodiment. Stages are listed as:
# video preprocessing/augmentation (single camera key), state and action
# conversion, concatenation, and finally the GR00T IDM model transform.
transform_oxe_fractal:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # --- video stages ---
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to: [video.image_pad_res256_freq03]
  - _target_: gr00t.data.transform.VideoCrop
    apply_to: [video.image_pad_res256_freq03]
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to: [video.image_pad_res256_freq03]
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to: [video.image_pad_res256_freq03]
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to: [video.image_pad_res256_freq03]
  # --- state stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - state.eef_position
    - state.eef_rotation
    - state.gripper_closedness_commanded
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - state.eef_position
    - state.eef_rotation
    - state.gripper_closedness_commanded
    normalization_modes:
      state.eef_position: min_max
      state.gripper_closedness_commanded: min_max
    target_rotations: {state.eef_rotation: rotation_6d}
  # --- action stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - action.world_vector
    - action.rotation_delta
    - action.gripper_position
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - action.world_vector
    - action.rotation_delta
    - action.gripper_position
    # Only the gripper key has a normalization mode configured here.
    normalization_modes: {action.gripper_position: binary}
    target_rotations: {action.rotation_delta: axis_angle}
  # --- concatenation order per modality ---
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order: [video.image_pad_res256_freq03]
    state_concat_order:
    - state.eef_position
    - state.eef_rotation
    - state.gripper_closedness_commanded
    action_concat_order:
    - action.world_vector
    - action.rotation_delta
    - action.gripper_position
  # --- model-specific transform ---
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: 'Perform the default behavior.'
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment tag name -> integer index. Indices are not contiguous:
    # 26 is unassigned and 14 is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Modality spec for the oxe_language_table embodiment: for each modality, the
# dataset keys to load and the delta_indices to load them at. Every modality
# here uses a single key.
# (delta_indices are presumably offsets relative to the sampled step -
# confirm against gr00t.data.dataset.ModalityConfig.)
modality_config_oxe_language_table:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys: [video.rgb_pad_res256_freq10]
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [state.effector_translation]
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, 0 through 15.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys: [action.action]
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [annotation.language.instruction]
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [lapa_action]
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [dream_actions]
# Transform pipeline for the oxe_language_table embodiment. Stages are listed
# as: video preprocessing/augmentation, state and action conversion,
# concatenation, and finally the GR00T IDM model transform. Each modality has
# exactly one key, and no target_rotations are configured.
transform_oxe_language_table:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # --- video stages ---
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to: [video.rgb_pad_res256_freq10]
  - _target_: gr00t.data.transform.VideoCrop
    apply_to: [video.rgb_pad_res256_freq10]
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to: [video.rgb_pad_res256_freq10]
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to: [video.rgb_pad_res256_freq10]
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to: [video.rgb_pad_res256_freq10]
  # --- state stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [state.effector_translation]
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to: [state.effector_translation]
    normalization_modes: {state.effector_translation: min_max}
  # --- action stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [action.action]
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to: [action.action]
    normalization_modes: {action.action: min_max}
  # --- concatenation order per modality ---
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order: [video.rgb_pad_res256_freq10]
    state_concat_order: [state.effector_translation]
    action_concat_order: [action.action]
  # --- model-specific transform ---
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: 'Perform the default behavior.'
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment tag name -> integer index. Indices are not contiguous:
    # 26 is unassigned and 14 is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Modality spec for the oxe_bridge embodiment: for each modality, the dataset
# keys to load and the delta_indices to load them at.
# (delta_indices are presumably offsets relative to the sampled step -
# confirm against gr00t.data.dataset.ModalityConfig.)
modality_config_oxe_bridge:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys: [video.image_0, video.image_1, video.image_2]
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - state.eef_position
    - state.eef_rotation
    - state.gripper_closed
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, 0 through 15.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.eef_position
    - action.eef_rotation
    - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [annotation.language.language_instruction]
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [lapa_action]
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [dream_actions]
# Transform pipeline for the oxe_bridge embodiment. Stages are listed as:
# video preprocessing/augmentation (three image keys), state and action
# conversion, concatenation, and finally the GR00T IDM model transform.
transform_oxe_bridge:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # --- video stages ---
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to: [video.image_0, video.image_1, video.image_2]
  - _target_: gr00t.data.transform.VideoCrop
    apply_to: [video.image_0, video.image_1, video.image_2]
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to: [video.image_0, video.image_1, video.image_2]
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to: [video.image_0, video.image_1, video.image_2]
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to: [video.image_0, video.image_1, video.image_2]
  # --- state stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [state.eef_position, state.eef_rotation, state.gripper_closed]
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to: [state.eef_position, state.eef_rotation, state.gripper_closed]
    normalization_modes:
      state.eef_position: min_max
      state.gripper_closed: min_max
    target_rotations: {state.eef_rotation: rotation_6d}
  # --- action stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - action.eef_position
    - action.eef_rotation
    - action.gripper_position
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - action.eef_position
    - action.eef_rotation
    - action.gripper_position
    # Only the gripper key has a normalization mode configured here.
    normalization_modes: {action.gripper_position: binary}
    target_rotations: {action.eef_rotation: axis_angle}
  # --- concatenation order per modality ---
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order: [video.image_0, video.image_1, video.image_2]
    state_concat_order:
    - state.eef_position
    - state.eef_rotation
    - state.gripper_closed
    action_concat_order:
    - action.eef_position
    - action.eef_rotation
    - action.gripper_position
  # --- model-specific transform ---
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: 'Perform the default behavior.'
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment tag name -> integer index. Indices are not contiguous:
    # 26 is unassigned and 14 is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Modality spec for the hot3d_hands_only embodiment: for each modality, the
# dataset keys to load and the delta_indices to load them at.
# No `language` modality is declared in this section.
# (delta_indices are presumably offsets relative to the sampled step -
# confirm against gr00t.data.dataset.ModalityConfig.)
modality_config_hot3d_hands_only:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys: [video.ego_view]
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - state.left_wrist_position
    - state.left_wrist_rotation
    - state.left_joint_rotation
    - state.right_wrist_position
    - state.right_wrist_rotation
    - state.right_joint_rotation
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, 0 through 15.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.left_wrist_position
    - action.left_wrist_rotation
    - action.left_joint_rotation
    - action.right_wrist_position
    - action.right_wrist_rotation
    - action.right_joint_rotation
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [lapa_action]
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [dream_actions]
# Transform pipeline for the hot3d_hands_only embodiment. Stages are listed
# as: video preprocessing/augmentation (single ego-view key), state and action
# conversion, concatenation, and finally the GR00T IDM model transform.
transform_hot3d_hands_only:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # --- video stages ---
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to: [video.ego_view]
  - _target_: gr00t.data.transform.VideoCrop
    apply_to: [video.ego_view]
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to: [video.ego_view]
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to: [video.ego_view]
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to: [video.ego_view]
  # --- state stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - state.left_wrist_position
    - state.left_wrist_rotation
    - state.left_joint_rotation
    - state.right_wrist_position
    - state.right_wrist_rotation
    - state.right_joint_rotation
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - state.left_wrist_position
    - state.left_wrist_rotation
    - state.left_joint_rotation
    - state.right_wrist_position
    - state.right_wrist_rotation
    - state.right_joint_rotation
    # The *_joint_rotation keys are listed in apply_to but have no entry in
    # normalization_modes or target_rotations.
    normalization_modes:
      state.left_wrist_position: min_max
      state.right_wrist_position: min_max
    target_rotations:
      state.left_wrist_rotation: quaternion
      state.right_wrist_rotation: quaternion
  # --- action stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - action.left_wrist_position
    - action.left_wrist_rotation
    - action.left_joint_rotation
    - action.right_wrist_position
    - action.right_wrist_rotation
    - action.right_joint_rotation
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - action.left_wrist_position
    - action.left_wrist_rotation
    - action.left_joint_rotation
    - action.right_wrist_position
    - action.right_wrist_rotation
    - action.right_joint_rotation
    # Same pattern as the state stage: *_joint_rotation keys have no entry in
    # normalization_modes or target_rotations.
    normalization_modes:
      action.left_wrist_position: min_max
      action.right_wrist_position: min_max
    target_rotations:
      action.left_wrist_rotation: quaternion
      action.right_wrist_rotation: quaternion
  # --- concatenation order per modality ---
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order: [video.ego_view]
    state_concat_order:
    - state.left_wrist_position
    - state.left_wrist_rotation
    - state.left_joint_rotation
    - state.right_wrist_position
    - state.right_wrist_rotation
    - state.right_joint_rotation
    action_concat_order:
    - action.left_wrist_position
    - action.left_wrist_rotation
    - action.left_joint_rotation
    - action.right_wrist_position
    - action.right_wrist_rotation
    - action.right_joint_rotation
  # --- model-specific transform ---
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: 'Perform the default behavior.'
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment tag name -> integer index. Indices are not contiguous:
    # 26 is unassigned and 14 is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Modality spec for the agibot embodiment: for each modality, the dataset keys
# to load and the delta_indices to load them at. The action modality carries
# one key (action.robot_velocity) that has no counterpart in the state list.
# (delta_indices are presumably offsets relative to the sampled step -
# confirm against gr00t.data.dataset.ModalityConfig.)
modality_config_agibot:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys: [video.top_head, video.hand_left, video.hand_right]
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - state.left_arm_joint_position
    - state.right_arm_joint_position
    - state.left_effector_position
    - state.right_effector_position
    - state.head_position
    - state.waist_position
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, 0 through 15.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.left_arm_joint_position
    - action.right_arm_joint_position
    - action.left_effector_position
    - action.right_effector_position
    - action.head_position
    - action.waist_position
    - action.robot_velocity
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [annotation.agibot.task_description]
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [lapa_action]
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [dream_actions]
# Transform pipeline for the agibot embodiment. Stages are listed as: video
# preprocessing/augmentation (three camera keys), state and action conversion,
# concatenation, and finally the GR00T IDM model transform. Every state and
# action key is normalized with min_max; no target_rotations are configured.
transform_agibot:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # --- video stages ---
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to: [video.top_head, video.hand_left, video.hand_right]
  - _target_: gr00t.data.transform.VideoCrop
    apply_to: [video.top_head, video.hand_left, video.hand_right]
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to: [video.top_head, video.hand_left, video.hand_right]
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to: [video.top_head, video.hand_left, video.hand_right]
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to: [video.top_head, video.hand_left, video.hand_right]
  # --- state stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - state.left_arm_joint_position
    - state.right_arm_joint_position
    - state.left_effector_position
    - state.right_effector_position
    - state.head_position
    - state.waist_position
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - state.left_arm_joint_position
    - state.right_arm_joint_position
    - state.left_effector_position
    - state.right_effector_position
    - state.head_position
    - state.waist_position
    normalization_modes:
      state.left_arm_joint_position: min_max
      state.right_arm_joint_position: min_max
      state.left_effector_position: min_max
      state.right_effector_position: min_max
      state.head_position: min_max
      state.waist_position: min_max
  # --- action stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - action.left_arm_joint_position
    - action.right_arm_joint_position
    - action.left_effector_position
    - action.right_effector_position
    - action.head_position
    - action.waist_position
    - action.robot_velocity
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - action.left_arm_joint_position
    - action.right_arm_joint_position
    - action.left_effector_position
    - action.right_effector_position
    - action.head_position
    - action.waist_position
    - action.robot_velocity
    normalization_modes:
      action.left_arm_joint_position: min_max
      action.right_arm_joint_position: min_max
      action.left_effector_position: min_max
      action.right_effector_position: min_max
      action.head_position: min_max
      action.waist_position: min_max
      action.robot_velocity: min_max
  # --- concatenation order per modality ---
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order: [video.top_head, video.hand_left, video.hand_right]
    state_concat_order:
    - state.left_arm_joint_position
    - state.right_arm_joint_position
    - state.left_effector_position
    - state.right_effector_position
    - state.head_position
    - state.waist_position
    action_concat_order:
    - action.left_arm_joint_position
    - action.right_arm_joint_position
    - action.left_effector_position
    - action.right_effector_position
    - action.head_position
    - action.waist_position
    - action.robot_velocity
  # --- model-specific transform ---
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: 'Perform the default behavior.'
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment tag name -> integer index. Indices are not contiguous:
    # 26 is unassigned and 14 is listed last.
    # NOTE(review): there is no `agibot` key in this mapping - confirm which
    # tag this embodiment resolves to (and whether index 26 was meant for it).
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Modality spec for the oxe_mutex embodiment: for each modality, the dataset
# keys to load and the delta_indices to load them at.
# (delta_indices are presumably offsets relative to the sampled step -
# confirm against gr00t.data.dataset.ModalityConfig.)
modality_config_oxe_mutex:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys: [video.image, video.wrist_image]
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [state.joint_angles, state.gripper_closed]
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, 0 through 15.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.eef_position
    - action.eef_rotation
    - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [annotation.language.language_instruction]
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [lapa_action]
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [dream_actions]
# Transform pipeline for the oxe_mutex embodiment. Stages are listed as: video
# preprocessing/augmentation (two camera keys), state and action conversion,
# concatenation, and finally the GR00T IDM model transform.
transform_oxe_mutex:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # --- video stages ---
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to: [video.image, video.wrist_image]
  - _target_: gr00t.data.transform.VideoCrop
    apply_to: [video.image, video.wrist_image]
    scale: 0.95
    mode: random
  - _target_: gr00t.data.transform.VideoResize
    apply_to: [video.image, video.wrist_image]
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to: [video.image, video.wrist_image]
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to: [video.image, video.wrist_image]
  # --- state stages: no target_rotations are configured for state ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [state.joint_angles, state.gripper_closed]
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to: [state.joint_angles, state.gripper_closed]
    normalization_modes:
      state.joint_angles: min_max
      state.gripper_closed: min_max
  # --- action stages ---
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to:
    - action.eef_position
    - action.eef_rotation
    - action.gripper_position
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to:
    - action.eef_position
    - action.eef_rotation
    - action.gripper_position
    # Only the gripper key has a normalization mode configured here.
    normalization_modes: {action.gripper_position: binary}
    target_rotations: {action.eef_rotation: axis_angle}
  # --- concatenation order per modality ---
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order: [video.image, video.wrist_image]
    state_concat_order: [state.joint_angles, state.gripper_closed]
    action_concat_order:
    - action.eef_position
    - action.eef_rotation
    - action.gripper_position
  # --- model-specific transform ---
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: 'Perform the default behavior.'
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Embodiment tag name -> integer index. Indices are not contiguous:
    # 26 is unassigned and 14 is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Modality spec for the oxe_plex embodiment: for each modality, the dataset
# keys to load and the delta_indices to load them at. State is a single
# key (state.state).
# (delta_indices are presumably offsets relative to the sampled step -
# confirm against gr00t.data.dataset.ModalityConfig.)
modality_config_oxe_plex:
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys: [video.image, video.wrist_image]
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [state.state]
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    # 16 consecutive indices, 0 through 15.
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.eef_position
    - action.eef_rotation
    - action.gripper_position
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [annotation.language.language_instruction]
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [lapa_action]
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys: [dream_actions]
# Transform pipeline stored under the `oxe_plex` name: a
# ComposedModalityTransform whose `transforms` list holds, in this order, the
# video steps, the state and action steps, a ConcatTransform and finally a
# GR00TIDMTransform.
transform_oxe_plex:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # Video steps; every one is applied to the same two video keys.
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to: [video.image, video.wrist_image]
  - _target_: gr00t.data.transform.VideoCrop
    apply_to: [video.image, video.wrist_image]
    scale: 0.95
    mode: random
  # NOTE(review): resize target is 224x224 while the SigLIP checkpoint name
  # below ends in -256; presumably the processor resizes again - confirm.
  - _target_: gr00t.data.transform.VideoResize
    apply_to: [video.image, video.wrist_image]
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to: [video.image, video.wrist_image]
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to: [video.image, video.wrist_image]
  # State steps: tensor conversion, then min_max normalization of state.state.
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [state.state]
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to: [state.state]
    normalization_modes:
      state.state: min_max
  # Action steps. Only action.gripper_position has a normalization mode listed;
  # action.eef_rotation gets a target rotation of axis_angle.
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [action.eef_position, action.eef_rotation, action.gripper_position]
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to: [action.eef_position, action.eef_rotation, action.gripper_position]
    normalization_modes:
      action.gripper_position: binary
    target_rotations:
      action.eef_rotation: axis_angle
  # Concatenation order per modality.
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order: [video.image, video.wrist_image]
    state_concat_order: [state.state]
    action_concat_order: [action.eef_position, action.eef_rotation, action.gripper_position]
  # Final model-side transform.
  # NOTE(review): max_sequence_length 112 equals 6 * 16 + 16; presumably
  # images * visual tokens per frame plus action_horizon - confirm.
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: Perform the default behavior.
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    # Same checkpoint name as siglip_model_cfg in the model section of this file.
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Tag name -> integer id. Ids are not contiguous (26 is unused) and the
    # entry with id 14 is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Per-modality loading spec stored under the `oxe_roboset` name. Every entry
# instantiates gr00t.data.dataset.ModalityConfig from a list of delta indices
# and a list of dataset keys.
modality_config_oxe_roboset:
  # Three video keys, each requested at delta indices 0 and 16.
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
    - video.image_left
    - video.image_right
    - video.image_wrist
  # Two state keys at delta index 0 only.
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - state.joint_position
    - state.gripper_closed
  # Two action keys over delta indices 0-15 (16 entries; the same count as
  # action_horizon: 16 elsewhere in this file).
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.joint_position
    - action.gripper_position
  # One language annotation key at delta index 0.
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - annotation.language.language_instruction
  # Two extra single-index modalities, each with one key named after itself.
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - dream_actions
# Transform pipeline stored under the `oxe_roboset` name: a
# ComposedModalityTransform whose `transforms` list holds, in this order, the
# video steps, the state and action steps, a ConcatTransform and finally a
# GR00TIDMTransform.
transform_oxe_roboset:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # Video steps; every one is applied to the same three video keys.
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to: [video.image_left, video.image_right, video.image_wrist]
  - _target_: gr00t.data.transform.VideoCrop
    apply_to: [video.image_left, video.image_right, video.image_wrist]
    scale: 0.95
    mode: random
  # NOTE(review): resize target is 224x224 while the SigLIP checkpoint name
  # below ends in -256; presumably the processor resizes again - confirm.
  - _target_: gr00t.data.transform.VideoResize
    apply_to: [video.image_left, video.image_right, video.image_wrist]
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to: [video.image_left, video.image_right, video.image_wrist]
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to: [video.image_left, video.image_right, video.image_wrist]
  # State steps: tensor conversion, then min_max normalization of both keys.
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [state.joint_position, state.gripper_closed]
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to: [state.joint_position, state.gripper_closed]
    normalization_modes:
      state.joint_position: min_max
      state.gripper_closed: min_max
  # Action steps: joint positions use min_max, the gripper uses binary.
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [action.joint_position, action.gripper_position]
  - _target_: gr00t.data.transform.StateActionTransform
    apply_to: [action.joint_position, action.gripper_position]
    normalization_modes:
      action.joint_position: min_max
      action.gripper_position: binary
  # Concatenation order per modality.
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order: [video.image_left, video.image_right, video.image_wrist]
    state_concat_order: [state.joint_position, state.gripper_closed]
    action_concat_order: [action.joint_position, action.gripper_position]
  # Final model-side transform.
  # NOTE(review): max_sequence_length 112 equals 6 * 16 + 16; presumably
  # images * visual tokens per frame plus action_horizon - confirm.
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: Perform the default behavior.
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    # Same checkpoint name as siglip_model_cfg in the model section of this file.
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Tag name -> integer id. Ids are not contiguous (26 is unused) and the
    # entry with id 14 is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Per-modality loading spec stored under the `lapa` name. It defines video,
# language, lapa_action and dream_actions entries only; there is no `state`
# or `action` entry in this mapping.
modality_config_lapa:
  # One video key, requested at delta indices 0 and 16.
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
    - video.ego
  # One language annotation key at delta index 0.
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - annotation.human.action.task_description
  # Two extra single-index modalities, each with one key named after itself.
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - dream_actions
# Transform pipeline stored under the `lapa` name: a ComposedModalityTransform
# whose `transforms` list holds the video steps, a video-only ConcatTransform
# and finally a GR00TIDMTransform. No state or action steps are listed.
transform_lapa:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # Video steps; every one is applied to the single video.ego key.
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to: [video.ego]
  - _target_: gr00t.data.transform.VideoCrop
    apply_to: [video.ego]
    scale: 0.95
    mode: random
  # NOTE(review): resize target is 224x224 while the SigLIP checkpoint name
  # below ends in -256; presumably the processor resizes again - confirm.
  - _target_: gr00t.data.transform.VideoResize
    apply_to: [video.ego]
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to: [video.ego]
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to: [video.ego]
  # Only a video concat order is given here.
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order: [video.ego]
  # Final model-side transform.
  # NOTE(review): max_sequence_length 112 equals 6 * 16 + 16; presumably
  # images * visual tokens per frame plus action_horizon - confirm.
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: Perform the default behavior.
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    # Same checkpoint name as siglip_model_cfg in the model section of this file.
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Tag name -> integer id. Ids are not contiguous (26 is unused) and the
    # entry with id 14 is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
# Per-modality loading spec stored under the `dream` name. Every entry
# instantiates gr00t.data.dataset.ModalityConfig from a list of delta indices
# and a list of dataset keys.
modality_config_dream:
  # One video key, requested at delta indices 0 and 16.
  video:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 16]
    modality_keys:
    - video.ego_view_bg_crop_pad_res256_freq20
  # Five state keys at delta index 0 only.
  state:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - state.left_arm
    - state.right_arm
    - state.left_hand
    - state.right_hand
    - state.waist
  # Five action keys (same suffixes and order as the state keys) over delta
  # indices 0-15.
  action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    modality_keys:
    - action.left_arm
    - action.right_arm
    - action.left_hand
    - action.right_hand
    - action.waist
  # One language annotation key at delta index 0.
  language:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - annotation.human.coarse_action
  # Two extra single-index modalities, each with one key named after itself.
  lapa_action:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - lapa_action
  dream_actions:
    _target_: gr00t.data.dataset.ModalityConfig
    delta_indices: [0]
    modality_keys:
    - dream_actions
# Transform pipeline stored under the `dream` name: a ComposedModalityTransform
# whose `transforms` list holds, in this order, the video steps, tensor
# conversion of state and action keys, a ConcatTransform and finally a
# GR00TIDMTransform.
transform_dream:
  _target_: gr00t.data.transform.ComposedModalityTransform
  transforms:
  # Video steps; every one is applied to the single video key.
  - _target_: gr00t.data.transform.VideoToTensor
    apply_to: [video.ego_view_bg_crop_pad_res256_freq20]
  - _target_: gr00t.data.transform.VideoCrop
    apply_to: [video.ego_view_bg_crop_pad_res256_freq20]
    scale: 0.95
    mode: random
  # NOTE(review): resize target is 224x224 while the SigLIP checkpoint name
  # below ends in -256; presumably the processor resizes again - confirm.
  - _target_: gr00t.data.transform.VideoResize
    apply_to: [video.ego_view_bg_crop_pad_res256_freq20]
    height: 224
    width: 224
    interpolation: linear
  - _target_: gr00t.data.transform.VideoColorJitter
    apply_to: [video.ego_view_bg_crop_pad_res256_freq20]
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  - _target_: gr00t.data.transform.VideoToNumpy
    apply_to: [video.ego_view_bg_crop_pad_res256_freq20]
  # State and action keys are converted to tensors only.
  # NOTE(review): this pipeline has no StateActionTransform step, so no
  # normalization_modes are configured for state or action - confirm intended.
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [state.left_arm, state.right_arm, state.left_hand, state.right_hand, state.waist]
  - _target_: gr00t.data.transform.StateActionToTensor
    apply_to: [action.left_arm, action.right_arm, action.left_hand, action.right_hand, action.waist]
  # Concatenation order per modality.
  - _target_: gr00t.data.transform.ConcatTransform
    video_concat_order: [video.ego_view_bg_crop_pad_res256_freq20]
    state_concat_order: [state.left_arm, state.right_arm, state.left_hand, state.right_hand, state.waist]
    action_concat_order: [action.left_arm, action.right_arm, action.left_hand, action.right_hand, action.waist]
  # Final model-side transform.
  # NOTE(review): max_sequence_length 112 equals 6 * 16 + 16; presumably
  # images * visual tokens per frame plus action_horizon - confirm.
  - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
    default_instruction: Perform the default behavior.
    num_visual_tokens_per_frame: 16
    max_num_images_per_sequence: 6
    max_action_dim: 32
    max_sequence_length: 112
    action_horizon: 16
    # Same checkpoint name as siglip_model_cfg in the model section of this file.
    siglip_processor:
      _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
      _convert_: object
      pretrained_model_name_or_path: google/siglip2-large-patch16-256
    # Tag name -> integer id. Ids are not contiguous (26 is unused) and the
    # entry with id 14 is listed last.
    embodiment_tag_mapping:
      real_gr1_arms_only: 0
      real_gr1_arms_only_annotated: 1
      real_gr1_arms_waist: 2
      real_gr1_arms_waist_annotated: 3
      dexmg_gr1_arms_only_inspire: 4
      dexmg_gr1_arms_only_fourier: 5
      dexmg_gr1_arms_waist_fourier: 6
      robocasa_single_arm: 7
      onex_eve_gripper: 8
      robocasa_gr1_arms_only_inspire_hands: 9
      robocasa_gr1_arms_only_fourier_hands: 10
      robocasa_gr1_fixed_lower_body_inspire_hands: 11
      robocasa_gr1_fixed_lower_body_fourier_hands: 12
      robocasa_panda_omron: 13
      robocasa_bimanual_panda_parallel_gripper: 15
      robocasa_bimanual_panda_inspire_hand: 16
      oxe_droid: 17
      oxe_fractal: 18
      oxe_language_table: 19
      oxe_bridge: 20
      real_panda_single_arm: 21
      unknown: 22
      hot3d_hands_only: 23
      gr1_unified: 24
      robocasa_gr1_arms_waist_fourier_hands: 25
      lapa: 27
      oxe_mutex: 28
      oxe_roboset: 29
      oxe_plex: 30
      dream: 31
      gr1_unified_segmentation: 14
modality_configs:
  # Modality spec keyed `robocasa_gr1_arms_only_fourier_hands` (a name that
  # also appears in embodiment_tag_mapping). Each entry is a ModalityConfig
  # with delta indices and dataset keys.
  robocasa_gr1_arms_only_fourier_hands:
    # One video key at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.ego_view_pad_res256_freq20
    # Four state keys at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
    # Four action keys over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.human.action.task_description
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `robocasa_gr1_arms_waist_fourier_hands` (a name that
  # also appears in embodiment_tag_mapping). Each entry is a ModalityConfig
  # with delta indices and dataset keys.
  robocasa_gr1_arms_waist_fourier_hands:
    # One video key at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.ego_view_pad_res256_freq20
    # Five state keys (arms, hands, waist) at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
    # Five action keys over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.human.action.task_description
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `robocasa_gr1_fixed_lower_body_fourier_hands` (a name
  # that also appears in embodiment_tag_mapping). Each entry is a
  # ModalityConfig with delta indices and dataset keys.
  robocasa_gr1_fixed_lower_body_fourier_hands:
    # One video key at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.agentview_pad_res256_freq20
    # Six state keys (arms, hands, waist, neck) at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
      - state.neck
    # Six action keys over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
      - action.neck
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.human.action.task_description
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `robocasa_bimanual_panda_parallel_gripper` (a name that
  # also appears in embodiment_tag_mapping). Each entry is a ModalityConfig
  # with delta indices and dataset keys.
  robocasa_bimanual_panda_parallel_gripper:
    # Three video keys at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.robot0_eye_in_hand_pad_res256_freq20
      - video.robot1_eye_in_hand_pad_res256_freq20
      - video.agentview_pad_res256_freq20
    # Six state keys at delta index 0; right-side keys are listed before left.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.right_arm_eef_pos
      - state.right_arm_eef_quat
      - state.right_gripper_qpos
      - state.left_arm_eef_pos
      - state.left_arm_eef_quat
      - state.left_gripper_qpos
    # Six action keys over delta indices 0-15. Key suffixes differ from the
    # state keys (eef_rot / gripper_close vs eef_quat / gripper_qpos).
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.right_arm_eef_pos
      - action.right_arm_eef_rot
      - action.right_gripper_close
      - action.left_arm_eef_pos
      - action.left_arm_eef_rot
      - action.left_gripper_close
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.human.action.task_description
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `robocasa_bimanual_panda_inspire_hand` (a name that
  # also appears in embodiment_tag_mapping). Each entry is a ModalityConfig
  # with delta indices and dataset keys.
  robocasa_bimanual_panda_inspire_hand:
    # Three video keys at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.robot0_eye_in_hand_pad_res256_freq20
      - video.robot1_eye_in_hand_pad_res256_freq20
      - video.agentview_pad_res256_freq20
    # Six state keys at delta index 0; right-side keys are listed before left.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.right_arm_eef_pos
      - state.right_arm_eef_quat
      - state.right_hand
      - state.left_arm_eef_pos
      - state.left_arm_eef_quat
      - state.left_hand
    # Six action keys over delta indices 0-15. The rotation keys use the
    # eef_rot suffix where the state keys use eef_quat.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.right_arm_eef_pos
      - action.right_arm_eef_rot
      - action.right_hand
      - action.left_arm_eef_pos
      - action.left_arm_eef_rot
      - action.left_hand
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.human.action.task_description
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `robocasa_panda_omron` (a name that also appears in
  # embodiment_tag_mapping). Each entry is a ModalityConfig with delta indices
  # and dataset keys.
  robocasa_panda_omron:
    # Three video keys at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.res256_image_side_0
      - video.res256_image_side_1
      - video.res256_image_wrist_0
    # Five state keys at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.end_effector_position_relative
      - state.end_effector_rotation_relative
      - state.gripper_qpos
      - state.base_position
      - state.base_rotation
    # Five action keys over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.end_effector_position
      - action.end_effector_rotation
      - action.gripper_close
      - action.base_motion
      - action.control_mode
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.human.action.task_description
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `gr1_unified` (a name that also appears in
  # embodiment_tag_mapping). Each entry is a ModalityConfig with delta indices
  # and dataset keys.
  gr1_unified:
    # One video key at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.ego_view
    # Five state keys (arms, hands, waist) at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
    # Five action keys over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.human.coarse_action
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `oxe_droid` (a name that also appears in
  # embodiment_tag_mapping). Each entry is a ModalityConfig with delta indices
  # and dataset keys.
  oxe_droid:
    # Three video keys at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.exterior_image_1_left_pad_res256_freq15
      - video.exterior_image_2_left_pad_res256_freq15
      - video.wrist_image_left_pad_res256_freq15
    # Three state keys at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_position
    # Three action keys over delta indices 0-15; the position and rotation
    # keys carry a _delta suffix.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.eef_position_delta
      - action.eef_rotation_delta
      - action.gripper_position
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.language.language_instruction
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `oxe_fractal` (a name that also appears in
  # embodiment_tag_mapping). Each entry is a ModalityConfig with delta indices
  # and dataset keys.
  oxe_fractal:
    # One video key at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.image_pad_res256_freq03
    # Three state keys at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closedness_commanded
    # Three action keys over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.world_vector
      - action.rotation_delta
      - action.gripper_position
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.language.natural_language_instruction
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `oxe_language_table` (a name that also appears in
  # embodiment_tag_mapping). Each entry is a ModalityConfig with delta indices
  # and dataset keys.
  oxe_language_table:
    # One video key at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.rgb_pad_res256_freq10
    # One state key at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.effector_translation
    # One action key over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.action
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.language.instruction
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `oxe_bridge` (a name that also appears in
  # embodiment_tag_mapping). Each entry is a ModalityConfig with delta indices
  # and dataset keys.
  oxe_bridge:
    # Three video keys at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.image_0
      - video.image_1
      - video.image_2
    # Three state keys at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closed
    # Three action keys over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.language.language_instruction
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `oxe_mutex` (a name that also appears in
  # embodiment_tag_mapping). Each entry is a ModalityConfig with delta indices
  # and dataset keys.
  oxe_mutex:
    # Two video keys at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.image
      - video.wrist_image
    # Two state keys at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.joint_angles
      - state.gripper_closed
    # Three action keys over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.language.language_instruction
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `oxe_plex` (a name that also appears in
  # embodiment_tag_mapping). Each entry is a ModalityConfig with delta indices
  # and dataset keys.
  oxe_plex:
    # Two video keys at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.image
      - video.wrist_image
    # One state key at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.state
    # Three action keys over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.language.language_instruction
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # Modality spec keyed `oxe_roboset` (a name that also appears in
  # embodiment_tag_mapping). Each entry is a ModalityConfig with delta indices
  # and dataset keys.
  oxe_roboset:
    # Three video keys at delta indices 0 and 16.
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys:
      - video.image_left
      - video.image_right
      - video.image_wrist
    # Two state keys at delta index 0.
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - state.joint_position
      - state.gripper_closed
    # Two action keys over delta indices 0-15.
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
      - action.joint_position
      - action.gripper_position
    # One language annotation key at delta index 0.
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - annotation.language.language_instruction
    # Two extra single-index modalities, each with one key named after itself.
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - lapa_action
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
      - dream_actions
  # hot3d_hands_only: one ModalityConfig per modality. This embodiment declares
  # video, state, action, lapa_action and dream_actions; it has no `language`
  # entry.
  hot3d_hands_only:
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys: [video.ego_view]
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
        - state.left_wrist_position
        - state.left_wrist_rotation
        - state.left_joint_rotation
        - state.right_wrist_position
        - state.right_wrist_rotation
        - state.right_joint_rotation
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      # Sixteen consecutive indices, 0 through 15.
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
        - action.left_wrist_position
        - action.left_wrist_rotation
        - action.left_joint_rotation
        - action.right_wrist_position
        - action.right_wrist_rotation
        - action.right_joint_rotation
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [lapa_action]
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [dream_actions]
  # agibot: one ModalityConfig per modality, each listing the dataset keys to
  # read and the delta_indices at which to read them.
  # The action key list has one more entry (action.robot_velocity) than the
  # state key list.
  agibot:
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys: [video.top_head, video.hand_left, video.hand_right]
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
        - state.left_arm_joint_position
        - state.right_arm_joint_position
        - state.left_effector_position
        - state.right_effector_position
        - state.head_position
        - state.waist_position
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      # Sixteen consecutive indices, 0 through 15.
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
        - action.left_arm_joint_position
        - action.right_arm_joint_position
        - action.left_effector_position
        - action.right_effector_position
        - action.head_position
        - action.waist_position
        - action.robot_velocity
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [annotation.agibot.task_description]
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [lapa_action]
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [dream_actions]
  # lapa: declares video, language, lapa_action and dream_actions only; there
  # are no `state` or `action` entries for this embodiment.
  lapa:
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys: [video.ego]
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [annotation.human.action.task_description]
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [lapa_action]
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [dream_actions]
  # dream: one ModalityConfig per modality, each listing the dataset keys to
  # read and the delta_indices at which to read them.
  dream:
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys: [video.ego_view_bg_crop_pad_res256_freq20]
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      # Sixteen consecutive indices, 0 through 15.
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [annotation.human.coarse_action]
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [lapa_action]
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [dream_actions]
  # gr1_unified_segmentation: same video and state keys as the `dream` entry
  # above it, but the action keys are the segmentation target and its mask.
  gr1_unified_segmentation:
    video:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0, 16]
      modality_keys: [video.ego_view_bg_crop_pad_res256_freq20]
    state:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
    action:
      _target_: gr00t.data.dataset.ModalityConfig
      # Sixteen consecutive indices, 0 through 15.
      delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
      modality_keys: [action.segmentation_target, action.segmentation_target_mask]
    language:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [annotation.human.coarse_action]
    lapa_action:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [lapa_action]
    dream_actions:
      _target_: gr00t.data.dataset.ModalityConfig
      delta_indices: [0]
      modality_keys: [dream_actions]
transforms:
  # Transform pipeline for robocasa_gr1_arms_only_fourier_hands.
  # Entries under `transforms` are listed in this order: video transforms,
  # state transforms, action transforms, ConcatTransform, GR00TIDMTransform.
  robocasa_gr1_arms_only_fourier_hands:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video: single ego-view stream.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to: [video.ego_view_pad_res256_freq20]
    - _target_: gr00t.data.transform.VideoCrop
      apply_to: [video.ego_view_pad_res256_freq20]
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to: [video.ego_view_pad_res256_freq20]
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to: [video.ego_view_pad_res256_freq20]
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to: [video.ego_view_pad_res256_freq20]
    # State: arms and hands, all min_max.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to: [state.left_arm, state.right_arm, state.left_hand, state.right_hand]
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to: [state.left_arm, state.right_arm, state.left_hand, state.right_hand]
      normalization_modes:
        state.left_arm: min_max
        state.right_arm: min_max
        state.left_hand: min_max
        state.right_hand: min_max
    # Action: arms and hands, all min_max.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to: [action.left_arm, action.right_arm, action.left_hand, action.right_hand]
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to: [action.left_arm, action.right_arm, action.left_hand, action.right_hand]
      normalization_modes:
        action.right_arm: min_max
        action.left_arm: min_max
        action.right_hand: min_max
        action.left_hand: min_max
    # Concatenation order for each modality.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order: [video.ego_view_pad_res256_freq20]
      state_concat_order: [state.left_arm, state.right_arm, state.left_hand, state.right_hand]
      action_concat_order: [action.left_arm, action.right_arm, action.left_hand, action.right_hand]
    # Model-side transform; max_action_dim and action_horizon equal the
    # model-level values (action_dim: 32, action_horizon: 16).
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # NOTE(review): IDs are not contiguous — 26 is unused and 14 is listed
      # last. There is no `agibot` entry although an `agibot` modality config
      # exists in this file — confirm this is intended.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
  # Transform pipeline for robocasa_gr1_arms_waist_fourier_hands.
  # Entries under `transforms` are listed in this order: video transforms,
  # state transforms, action transforms, ConcatTransform, GR00TIDMTransform.
  robocasa_gr1_arms_waist_fourier_hands:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video: single ego-view stream.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to: [video.ego_view_pad_res256_freq20]
    - _target_: gr00t.data.transform.VideoCrop
      apply_to: [video.ego_view_pad_res256_freq20]
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to: [video.ego_view_pad_res256_freq20]
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to: [video.ego_view_pad_res256_freq20]
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to: [video.ego_view_pad_res256_freq20]
    # State: arms, hands and waist, all min_max.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      normalization_modes:
        state.left_arm: min_max
        state.right_arm: min_max
        state.left_hand: min_max
        state.right_hand: min_max
        state.waist: min_max
    # Action: arms, hands and waist, all min_max.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
      normalization_modes:
        action.right_arm: min_max
        action.left_arm: min_max
        action.right_hand: min_max
        action.left_hand: min_max
        action.waist: min_max
    # Concatenation order for each modality.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order: [video.ego_view_pad_res256_freq20]
      state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
      action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
    # Model-side transform; max_action_dim and action_horizon equal the
    # model-level values (action_dim: 32, action_horizon: 16).
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # NOTE(review): IDs are not contiguous — 26 is unused and 14 is listed
      # last. There is no `agibot` entry although an `agibot` modality config
      # exists in this file — confirm this is intended.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
  # Transform pipeline for robocasa_gr1_fixed_lower_body_fourier_hands.
  # Entries under `transforms` are listed in this order: video transforms,
  # state transforms, action transforms, ConcatTransform, GR00TIDMTransform.
  robocasa_gr1_fixed_lower_body_fourier_hands:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video: single agent-view stream.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to: [video.agentview_pad_res256_freq20]
    - _target_: gr00t.data.transform.VideoCrop
      apply_to: [video.agentview_pad_res256_freq20]
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to: [video.agentview_pad_res256_freq20]
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to: [video.agentview_pad_res256_freq20]
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to: [video.agentview_pad_res256_freq20]
    # State: arms, hands, waist and neck, all min_max.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
      normalization_modes:
        state.left_arm: min_max
        state.right_arm: min_max
        state.left_hand: min_max
        state.right_hand: min_max
        state.waist: min_max
        state.neck: min_max
    # Action: arms, hands, waist and neck, all min_max.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
      normalization_modes:
        action.right_arm: min_max
        action.left_arm: min_max
        action.right_hand: min_max
        action.left_hand: min_max
        action.waist: min_max
        action.neck: min_max
    # Concatenation order for each modality.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order: [video.agentview_pad_res256_freq20]
      state_concat_order:
        - state.left_arm
        - state.right_arm
        - state.left_hand
        - state.right_hand
        - state.waist
        - state.neck
      action_concat_order:
        - action.left_arm
        - action.right_arm
        - action.left_hand
        - action.right_hand
        - action.waist
        - action.neck
    # Model-side transform; max_action_dim and action_horizon equal the
    # model-level values (action_dim: 32, action_horizon: 16).
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # NOTE(review): IDs are not contiguous — 26 is unused and 14 is listed
      # last. There is no `agibot` entry although an `agibot` modality config
      # exists in this file — confirm this is intended.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
  # Transform pipeline for robocasa_bimanual_panda_parallel_gripper.
  # Entries under `transforms` are listed in this order: video transforms,
  # state transforms, action transforms, ConcatTransform, GR00TIDMTransform.
  robocasa_bimanual_panda_parallel_gripper:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video: two eye-in-hand streams plus the agent view.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    # State: positions and gripper qpos are min_max; the two *_eef_quat keys
    # have no normalization mode and are listed under target_rotations instead.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
      normalization_modes:
        state.right_arm_eef_pos: min_max
        state.right_gripper_qpos: min_max
        state.left_arm_eef_pos: min_max
        state.left_gripper_qpos: min_max
      target_rotations:
        state.right_arm_eef_quat: rotation_6d
        state.left_arm_eef_quat: rotation_6d
    # Action: only the two gripper_close keys have a normalization mode
    # (binary); the eef pos/rot keys have none.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
      normalization_modes:
        action.right_gripper_close: binary
        action.left_gripper_close: binary
    # Concatenation order for each modality.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      state_concat_order:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_gripper_qpos
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_gripper_qpos
      action_concat_order:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_gripper_close
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_gripper_close
    # Model-side transform; max_action_dim and action_horizon equal the
    # model-level values (action_dim: 32, action_horizon: 16).
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # NOTE(review): IDs are not contiguous — 26 is unused and 14 is listed
      # last. There is no `agibot` entry although an `agibot` modality config
      # exists in this file — confirm this is intended.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
  # Transform pipeline for robocasa_bimanual_panda_inspire_hand.
  # Entries under `transforms` are listed in this order: video transforms,
  # state transforms, action transforms, ConcatTransform, GR00TIDMTransform.
  robocasa_bimanual_panda_inspire_hand:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video: two eye-in-hand streams plus the agent view.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
    # State: positions and hands are min_max; the two *_eef_quat keys have no
    # normalization mode and are listed under target_rotations instead.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
      normalization_modes:
        state.right_arm_eef_pos: min_max
        state.right_hand: min_max
        state.left_arm_eef_pos: min_max
        state.left_hand: min_max
      target_rotations:
        state.right_arm_eef_quat: rotation_6d
        state.left_arm_eef_quat: rotation_6d
    # Action: only the two hand keys have a normalization mode (min_max); the
    # eef pos/rot keys have none.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
      normalization_modes:
        action.right_hand: min_max
        action.left_hand: min_max
    # Concatenation order for each modality.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.robot0_eye_in_hand_pad_res256_freq20
        - video.robot1_eye_in_hand_pad_res256_freq20
        - video.agentview_pad_res256_freq20
      state_concat_order:
        - state.right_arm_eef_pos
        - state.right_arm_eef_quat
        - state.right_hand
        - state.left_arm_eef_pos
        - state.left_arm_eef_quat
        - state.left_hand
      action_concat_order:
        - action.right_arm_eef_pos
        - action.right_arm_eef_rot
        - action.right_hand
        - action.left_arm_eef_pos
        - action.left_arm_eef_rot
        - action.left_hand
    # Model-side transform; max_action_dim and action_horizon equal the
    # model-level values (action_dim: 32, action_horizon: 16).
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # NOTE(review): IDs are not contiguous — 26 is unused and 14 is listed
      # last. There is no `agibot` entry although an `agibot` modality config
      # exists in this file — confirm this is intended.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
  # Transform pipeline for robocasa_panda_omron.
  # Entries under `transforms` are listed in this order: video transforms,
  # state transforms, action transforms, ConcatTransform, GR00TIDMTransform.
  robocasa_panda_omron:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video: two side streams plus one wrist stream.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      scale: 0.95
      mode: random
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
    # State: every key is min_max; the two rotation keys additionally appear
    # under target_rotations.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - state.end_effector_position_relative
        - state.end_effector_rotation_relative
        - state.gripper_qpos
        - state.base_position
        - state.base_rotation
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - state.end_effector_position_relative
        - state.end_effector_rotation_relative
        - state.gripper_qpos
        - state.base_position
        - state.base_rotation
      normalization_modes:
        state.end_effector_position_relative: min_max
        state.end_effector_rotation_relative: min_max
        state.gripper_qpos: min_max
        state.base_position: min_max
        state.base_rotation: min_max
      target_rotations:
        state.end_effector_rotation_relative: rotation_6d
        state.base_rotation: rotation_6d
    # Action: gripper_close and control_mode are binary; the rest are min_max.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
        - action.end_effector_position
        - action.end_effector_rotation
        - action.gripper_close
        - action.base_motion
        - action.control_mode
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
        - action.end_effector_position
        - action.end_effector_rotation
        - action.gripper_close
        - action.base_motion
        - action.control_mode
      normalization_modes:
        action.end_effector_position: min_max
        action.end_effector_rotation: min_max
        action.gripper_close: binary
        action.base_motion: min_max
        action.control_mode: binary
    # Concatenation order for each modality.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
        - video.res256_image_side_0
        - video.res256_image_side_1
        - video.res256_image_wrist_0
      state_concat_order:
        - state.end_effector_position_relative
        - state.end_effector_rotation_relative
        - state.gripper_qpos
        - state.base_position
        - state.base_rotation
      action_concat_order:
        - action.end_effector_position
        - action.end_effector_rotation
        - action.gripper_close
        - action.base_motion
        - action.control_mode
    # Model-side transform; max_action_dim and action_horizon equal the
    # model-level values (action_dim: 32, action_horizon: 16).
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # NOTE(review): IDs are not contiguous — 26 is unused and 14 is listed
      # last. There is no `agibot` entry although an `agibot` modality config
      # exists in this file — confirm this is intended.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
  # Transform pipeline for the `gr1_unified` embodiment: one video stream
  # (video.ego_view), five state groups and five action groups.
  gr1_unified:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # ---- video stages (all operate on video.ego_view) ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.ego_view
    - _target_: gr00t.data.transform.VideoCrop
      scale: 0.95
      mode: random
      apply_to:
      - video.ego_view
    - _target_: gr00t.data.transform.VideoResize
      height: 224
      width: 224
      interpolation: linear
      apply_to:
      - video.ego_view
    - _target_: gr00t.data.transform.VideoColorJitter
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
      apply_to:
      - video.ego_view
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.ego_view
    # ---- state stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
    # State groups go through the sin/cos transform; no normalization_modes
    # are configured for them in this pipeline.
    - _target_: gr00t.data.transform.StateActionSinCosTransform
      apply_to:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
    # ---- action stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
      normalization_modes:
        action.left_arm: min_max
        action.right_arm: min_max
        action.left_hand: min_max
        action.right_hand: min_max
        action.waist: min_max
    # ---- concatenation ----
    - _target_: gr00t.data.transform.ConcatTransform
      # The video key must be the one the video stages above operate on.
      # It previously named video.ego_view_bg_crop_pad_res256_freq20, which
      # no stage in this pipeline touches.
      # NOTE(review): confirm video.ego_view against the dataset's modality
      # metadata; if the dataset exposes the long key instead, rename the
      # five apply_to entries above rather than this one.
      video_concat_order:
      - video.ego_view
      state_concat_order:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
      action_concat_order:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
    # ---- model-input packing ----
    # max_action_dim / action_horizon repeat the model config values (32 / 16);
    # the processor checkpoint name matches siglip_model_cfg in the model config.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id, in ascending id order.
      # Id 26 is not assigned in this table.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        gr1_unified_segmentation: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
  # Transform pipeline for the `oxe_droid` embodiment: three video streams,
  # end-effector state (position / rotation / gripper) and delta actions.
  oxe_droid:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # ---- video stages (same three keys at every stage) ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.exterior_image_1_left_pad_res256_freq15
      - video.exterior_image_2_left_pad_res256_freq15
      - video.wrist_image_left_pad_res256_freq15
    - _target_: gr00t.data.transform.VideoCrop
      scale: 0.95
      mode: random
      apply_to:
      - video.exterior_image_1_left_pad_res256_freq15
      - video.exterior_image_2_left_pad_res256_freq15
      - video.wrist_image_left_pad_res256_freq15
    - _target_: gr00t.data.transform.VideoResize
      height: 224
      width: 224
      interpolation: linear
      apply_to:
      - video.exterior_image_1_left_pad_res256_freq15
      - video.exterior_image_2_left_pad_res256_freq15
      - video.wrist_image_left_pad_res256_freq15
    - _target_: gr00t.data.transform.VideoColorJitter
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
      apply_to:
      - video.exterior_image_1_left_pad_res256_freq15
      - video.exterior_image_2_left_pad_res256_freq15
      - video.wrist_image_left_pad_res256_freq15
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.exterior_image_1_left_pad_res256_freq15
      - video.exterior_image_2_left_pad_res256_freq15
      - video.wrist_image_left_pad_res256_freq15
    # ---- state stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_position
    # eef_rotation has no normalization mode; it is given a target rotation
    # representation (rotation_6d) instead.
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_position
      normalization_modes:
        state.eef_position: min_max
        state.gripper_position: min_max
      target_rotations:
        state.eef_rotation: rotation_6d
    # ---- action stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.eef_position_delta
      - action.eef_rotation_delta
      - action.gripper_position
    # Only the gripper has a normalization mode (binary); the rotation delta
    # is given an axis_angle target, and the position delta has neither.
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - action.eef_position_delta
      - action.eef_rotation_delta
      - action.gripper_position
      normalization_modes:
        action.gripper_position: binary
      target_rotations:
        action.eef_rotation_delta: axis_angle
    # ---- concatenation ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.exterior_image_1_left_pad_res256_freq15
      - video.exterior_image_2_left_pad_res256_freq15
      - video.wrist_image_left_pad_res256_freq15
      state_concat_order:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_position
      action_concat_order:
      - action.eef_position_delta
      - action.eef_rotation_delta
      - action.gripper_position
    # ---- model-input packing ----
    # max_action_dim / action_horizon repeat the model config values (32 / 16);
    # the processor checkpoint name matches siglip_model_cfg in the model config.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id, in ascending id order.
      # Id 26 is not assigned in this table.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        gr1_unified_segmentation: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
  # Transform pipeline for the `oxe_fractal` embodiment: one video stream,
  # end-effector state and world-vector / rotation-delta / gripper actions.
  oxe_fractal:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # ---- video stages (all operate on video.image_pad_res256_freq03) ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.image_pad_res256_freq03
    - _target_: gr00t.data.transform.VideoCrop
      scale: 0.95
      mode: random
      apply_to:
      - video.image_pad_res256_freq03
    - _target_: gr00t.data.transform.VideoResize
      height: 224
      width: 224
      interpolation: linear
      apply_to:
      - video.image_pad_res256_freq03
    - _target_: gr00t.data.transform.VideoColorJitter
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
      apply_to:
      - video.image_pad_res256_freq03
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.image_pad_res256_freq03
    # ---- state stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closedness_commanded
    # eef_rotation has no normalization mode; it is given a target rotation
    # representation (rotation_6d) instead.
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closedness_commanded
      normalization_modes:
        state.eef_position: min_max
        state.gripper_closedness_commanded: min_max
      target_rotations:
        state.eef_rotation: rotation_6d
    # ---- action stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.world_vector
      - action.rotation_delta
      - action.gripper_position
    # Only the gripper has a normalization mode (binary); rotation_delta is
    # given an axis_angle target, and world_vector has neither.
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - action.world_vector
      - action.rotation_delta
      - action.gripper_position
      normalization_modes:
        action.gripper_position: binary
      target_rotations:
        action.rotation_delta: axis_angle
    # ---- concatenation ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.image_pad_res256_freq03
      state_concat_order:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closedness_commanded
      action_concat_order:
      - action.world_vector
      - action.rotation_delta
      - action.gripper_position
    # ---- model-input packing ----
    # max_action_dim / action_horizon repeat the model config values (32 / 16);
    # the processor checkpoint name matches siglip_model_cfg in the model config.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id, in ascending id order.
      # Id 26 is not assigned in this table.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        gr1_unified_segmentation: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
  # Transform pipeline for the `oxe_language_table` embodiment: one video
  # stream, a single state group and a single action group.
  oxe_language_table:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # ---- video stages (all operate on video.rgb_pad_res256_freq10) ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.rgb_pad_res256_freq10
    - _target_: gr00t.data.transform.VideoCrop
      scale: 0.95
      mode: random
      apply_to:
      - video.rgb_pad_res256_freq10
    - _target_: gr00t.data.transform.VideoResize
      height: 224
      width: 224
      interpolation: linear
      apply_to:
      - video.rgb_pad_res256_freq10
    - _target_: gr00t.data.transform.VideoColorJitter
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
      apply_to:
      - video.rgb_pad_res256_freq10
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.rgb_pad_res256_freq10
    # ---- state stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.effector_translation
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - state.effector_translation
      normalization_modes:
        state.effector_translation: min_max
    # ---- action stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.action
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - action.action
      normalization_modes:
        action.action: min_max
    # ---- concatenation ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.rgb_pad_res256_freq10
      state_concat_order:
      - state.effector_translation
      action_concat_order:
      - action.action
    # ---- model-input packing ----
    # max_action_dim / action_horizon repeat the model config values (32 / 16);
    # the processor checkpoint name matches siglip_model_cfg in the model config.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id, in ascending id order.
      # Id 26 is not assigned in this table.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        gr1_unified_segmentation: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
  # Transform pipeline for the `oxe_bridge` embodiment: three video streams,
  # end-effector state and end-effector / gripper actions.
  oxe_bridge:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # ---- video stages (same three keys at every stage) ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.image_0
      - video.image_1
      - video.image_2
    - _target_: gr00t.data.transform.VideoCrop
      scale: 0.95
      mode: random
      apply_to:
      - video.image_0
      - video.image_1
      - video.image_2
    - _target_: gr00t.data.transform.VideoResize
      height: 224
      width: 224
      interpolation: linear
      apply_to:
      - video.image_0
      - video.image_1
      - video.image_2
    - _target_: gr00t.data.transform.VideoColorJitter
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
      apply_to:
      - video.image_0
      - video.image_1
      - video.image_2
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.image_0
      - video.image_1
      - video.image_2
    # ---- state stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closed
    # eef_rotation has no normalization mode; it is given a target rotation
    # representation (rotation_6d) instead.
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closed
      normalization_modes:
        state.eef_position: min_max
        state.gripper_closed: min_max
      target_rotations:
        state.eef_rotation: rotation_6d
    # ---- action stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
    # Only the gripper has a normalization mode (binary); eef_rotation is
    # given an axis_angle target, and eef_position has neither.
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
      normalization_modes:
        action.gripper_position: binary
      target_rotations:
        action.eef_rotation: axis_angle
    # ---- concatenation ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.image_0
      - video.image_1
      - video.image_2
      state_concat_order:
      - state.eef_position
      - state.eef_rotation
      - state.gripper_closed
      action_concat_order:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
    # ---- model-input packing ----
    # max_action_dim / action_horizon repeat the model config values (32 / 16);
    # the processor checkpoint name matches siglip_model_cfg in the model config.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id, in ascending id order.
      # Id 26 is not assigned in this table.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        gr1_unified_segmentation: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
  # Transform pipeline for the `hot3d_hands_only` embodiment: one video stream
  # plus left/right wrist position, wrist rotation and joint rotation for
  # both state and action.
  hot3d_hands_only:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # ---- video stages (all operate on video.ego_view) ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.ego_view
    - _target_: gr00t.data.transform.VideoCrop
      scale: 0.95
      mode: random
      apply_to:
      - video.ego_view
    - _target_: gr00t.data.transform.VideoResize
      height: 224
      width: 224
      interpolation: linear
      apply_to:
      - video.ego_view
    - _target_: gr00t.data.transform.VideoColorJitter
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
      apply_to:
      - video.ego_view
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.ego_view
    # ---- state stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.left_wrist_position
      - state.left_wrist_rotation
      - state.left_joint_rotation
      - state.right_wrist_position
      - state.right_wrist_rotation
      - state.right_joint_rotation
    # Wrist positions are min_max normalized and wrist rotations get a
    # quaternion target; the joint rotations have neither setting.
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - state.left_wrist_position
      - state.left_wrist_rotation
      - state.left_joint_rotation
      - state.right_wrist_position
      - state.right_wrist_rotation
      - state.right_joint_rotation
      normalization_modes:
        state.left_wrist_position: min_max
        state.right_wrist_position: min_max
      target_rotations:
        state.left_wrist_rotation: quaternion
        state.right_wrist_rotation: quaternion
    # ---- action stages (mirror the state configuration) ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.left_wrist_position
      - action.left_wrist_rotation
      - action.left_joint_rotation
      - action.right_wrist_position
      - action.right_wrist_rotation
      - action.right_joint_rotation
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - action.left_wrist_position
      - action.left_wrist_rotation
      - action.left_joint_rotation
      - action.right_wrist_position
      - action.right_wrist_rotation
      - action.right_joint_rotation
      normalization_modes:
        action.left_wrist_position: min_max
        action.right_wrist_position: min_max
      target_rotations:
        action.left_wrist_rotation: quaternion
        action.right_wrist_rotation: quaternion
    # ---- concatenation ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.ego_view
      state_concat_order:
      - state.left_wrist_position
      - state.left_wrist_rotation
      - state.left_joint_rotation
      - state.right_wrist_position
      - state.right_wrist_rotation
      - state.right_joint_rotation
      action_concat_order:
      - action.left_wrist_position
      - action.left_wrist_rotation
      - action.left_joint_rotation
      - action.right_wrist_position
      - action.right_wrist_rotation
      - action.right_joint_rotation
    # ---- model-input packing ----
    # max_action_dim / action_horizon repeat the model config values (32 / 16);
    # the processor checkpoint name matches siglip_model_cfg in the model config.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id, in ascending id order.
      # Id 26 is not assigned in this table.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        gr1_unified_segmentation: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
  # Transform pipeline for the `agibot` embodiment: three video streams, six
  # state groups and seven action groups (the six state names plus
  # robot_velocity).
  agibot:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # ---- video stages (same three keys at every stage) ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.top_head
      - video.hand_left
      - video.hand_right
    - _target_: gr00t.data.transform.VideoCrop
      scale: 0.95
      mode: random
      apply_to:
      - video.top_head
      - video.hand_left
      - video.hand_right
    - _target_: gr00t.data.transform.VideoResize
      height: 224
      width: 224
      interpolation: linear
      apply_to:
      - video.top_head
      - video.hand_left
      - video.hand_right
    - _target_: gr00t.data.transform.VideoColorJitter
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
      apply_to:
      - video.top_head
      - video.hand_left
      - video.hand_right
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.top_head
      - video.hand_left
      - video.hand_right
    # ---- state stages (every group is min_max normalized) ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.left_arm_joint_position
      - state.right_arm_joint_position
      - state.left_effector_position
      - state.right_effector_position
      - state.head_position
      - state.waist_position
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - state.left_arm_joint_position
      - state.right_arm_joint_position
      - state.left_effector_position
      - state.right_effector_position
      - state.head_position
      - state.waist_position
      normalization_modes:
        state.left_arm_joint_position: min_max
        state.right_arm_joint_position: min_max
        state.left_effector_position: min_max
        state.right_effector_position: min_max
        state.head_position: min_max
        state.waist_position: min_max
    # ---- action stages (every group is min_max normalized) ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.left_arm_joint_position
      - action.right_arm_joint_position
      - action.left_effector_position
      - action.right_effector_position
      - action.head_position
      - action.waist_position
      - action.robot_velocity
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - action.left_arm_joint_position
      - action.right_arm_joint_position
      - action.left_effector_position
      - action.right_effector_position
      - action.head_position
      - action.waist_position
      - action.robot_velocity
      normalization_modes:
        action.left_arm_joint_position: min_max
        action.right_arm_joint_position: min_max
        action.left_effector_position: min_max
        action.right_effector_position: min_max
        action.head_position: min_max
        action.waist_position: min_max
        action.robot_velocity: min_max
    # ---- concatenation ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.top_head
      - video.hand_left
      - video.hand_right
      state_concat_order:
      - state.left_arm_joint_position
      - state.right_arm_joint_position
      - state.left_effector_position
      - state.right_effector_position
      - state.head_position
      - state.waist_position
      action_concat_order:
      - action.left_arm_joint_position
      - action.right_arm_joint_position
      - action.left_effector_position
      - action.right_effector_position
      - action.head_position
      - action.waist_position
      - action.robot_velocity
    # ---- model-input packing ----
    # max_action_dim / action_horizon repeat the model config values (32 / 16);
    # the processor checkpoint name matches siglip_model_cfg in the model config.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id, in ascending id order.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        gr1_unified_segmentation: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        # This pipeline's own embodiment was missing from the table; 26 was
        # the only unassigned id in 0-31.
        # NOTE(review): confirm 26 against the tag table the checkpoint was
        # trained with before relying on it.
        agibot: 26
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
  # Transform pipeline for the `oxe_mutex` embodiment: two video streams,
  # joint-angle / gripper state and end-effector / gripper actions.
  oxe_mutex:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # ---- video stages (same two keys at every stage) ----
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.image
      - video.wrist_image
    - _target_: gr00t.data.transform.VideoCrop
      scale: 0.95
      mode: random
      apply_to:
      - video.image
      - video.wrist_image
    - _target_: gr00t.data.transform.VideoResize
      height: 224
      width: 224
      interpolation: linear
      apply_to:
      - video.image
      - video.wrist_image
    - _target_: gr00t.data.transform.VideoColorJitter
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
      apply_to:
      - video.image
      - video.wrist_image
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.image
      - video.wrist_image
    # ---- state stages (both groups are min_max normalized) ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.joint_angles
      - state.gripper_closed
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - state.joint_angles
      - state.gripper_closed
      normalization_modes:
        state.joint_angles: min_max
        state.gripper_closed: min_max
    # ---- action stages ----
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
    # Only the gripper has a normalization mode (binary); eef_rotation is
    # given an axis_angle target, and eef_position has neither.
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
      normalization_modes:
        action.gripper_position: binary
      target_rotations:
        action.eef_rotation: axis_angle
    # ---- concatenation ----
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.image
      - video.wrist_image
      state_concat_order:
      - state.joint_angles
      - state.gripper_closed
      action_concat_order:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
    # ---- model-input packing ----
    # max_action_dim / action_horizon repeat the model config values (32 / 16);
    # the processor checkpoint name matches siglip_model_cfg in the model config.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id, in ascending id order.
      # Id 26 is not assigned in this table.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        gr1_unified_segmentation: 14
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
  # Entry for the `oxe_plex` embodiment tag: a ComposedModalityTransform whose
  # `transforms` list holds video steps, state/action steps, a concat step and
  # the final GR00T IDM transform.
  # NOTE(review): steps are presumably applied in list order; confirm against
  # ComposedModalityTransform.
  oxe_plex:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video steps: every one targets the same two streams, video.image and
    # video.wrist_image.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.image
      - video.wrist_image
    # mode is `random`, so this step is non-deterministic.
    # NOTE(review): `scale: 0.95` presumably keeps 95% of the frame; confirm.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
      - video.image
      - video.wrist_image
      scale: 0.95
      mode: random
    # NOTE(review): frames are resized to 224x224 here while the SigLIP
    # checkpoint name below ends in `-256`; presumably siglip_processor resizes
    # again. Confirm.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
      - video.image
      - video.wrist_image
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
      - video.image
      - video.wrist_image
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.image
      - video.wrist_image
    # State steps: a single key, state.state, normalized with mode `min_max`.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.state
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - state.state
      normalization_modes:
        state.state: min_max
    # Action steps: three end-effector keys.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
    # Only action.gripper_position has a normalization mode (`binary`);
    # action.eef_rotation gets a target rotation (`axis_angle`) instead.
    # NOTE(review): action.eef_position has neither; confirm what
    # StateActionTransform does for keys without an explicit mode.
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
      normalization_modes:
        action.gripper_position: binary
      target_rotations:
        action.eef_rotation: axis_angle
    # Concat orders list the same keys, in the same order, as the steps above.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.image
      - video.wrist_image
      state_concat_order:
      - state.state
      action_concat_order:
      - action.eef_position
      - action.eef_rotation
      - action.gripper_position
    # Final step. max_action_dim (32) and action_horizon (16) equal the
    # action_dim / action_horizon values under `model.config` at the top of
    # this file.
    # NOTE(review): 6 images x 16 tokens = 96, and 96 + action_horizon (16)
    # = 112 = max_sequence_length; presumably how the limit was derived.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      # Same checkpoint name as `siglip_model_cfg` in the model section.
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id. Id 26 is not assigned in this mapping,
      # and gr1_unified_segmentation (14) is listed last, out of numeric order.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
  # Entry for the `oxe_roboset` embodiment tag: a ComposedModalityTransform
  # whose `transforms` list holds video steps, state/action steps, a concat
  # step and the final GR00T IDM transform.
  # NOTE(review): steps are presumably applied in list order; confirm against
  # ComposedModalityTransform.
  oxe_roboset:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video steps: every one targets the same three streams (left, right,
    # wrist). Three equals `max_num_views` in the action-head config at the
    # top of this file.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.image_left
      - video.image_right
      - video.image_wrist
    # mode is `random`, so this step is non-deterministic.
    # NOTE(review): `scale: 0.95` presumably keeps 95% of the frame; confirm.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
      - video.image_left
      - video.image_right
      - video.image_wrist
      scale: 0.95
      mode: random
    # NOTE(review): frames are resized to 224x224 here while the SigLIP
    # checkpoint name below ends in `-256`; presumably siglip_processor resizes
    # again. Confirm.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
      - video.image_left
      - video.image_right
      - video.image_wrist
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
      - video.image_left
      - video.image_right
      - video.image_wrist
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.image_left
      - video.image_right
      - video.image_wrist
    # State steps: both state keys are normalized with mode `min_max`.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.joint_position
      - state.gripper_closed
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - state.joint_position
      - state.gripper_closed
      normalization_modes:
        state.joint_position: min_max
        state.gripper_closed: min_max
    # Action steps: joint-space actions. Every action key has an explicit
    # normalization mode and no target_rotations are configured.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.joint_position
      - action.gripper_position
    - _target_: gr00t.data.transform.StateActionTransform
      apply_to:
      - action.joint_position
      - action.gripper_position
      normalization_modes:
        action.joint_position: min_max
        action.gripper_position: binary
    # Concat orders list the same keys, in the same order, as the steps above.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.image_left
      - video.image_right
      - video.image_wrist
      state_concat_order:
      - state.joint_position
      - state.gripper_closed
      action_concat_order:
      - action.joint_position
      - action.gripper_position
    # Final step. max_action_dim (32) and action_horizon (16) equal the
    # action_dim / action_horizon values under `model.config` at the top of
    # this file.
    # NOTE(review): 6 images x 16 tokens = 96, and 96 + action_horizon (16)
    # = 112 = max_sequence_length; presumably how the limit was derived.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      # Same checkpoint name as `siglip_model_cfg` in the model section.
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id. Id 26 is not assigned in this mapping,
      # and gr1_unified_segmentation (14) is listed last, out of numeric order.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
  # Entry for the `lapa` embodiment tag. This entry is video-only: it has no
  # state or action steps, and the concat step lists only a video order.
  # NOTE(review): steps are presumably applied in list order; confirm against
  # ComposedModalityTransform.
  lapa:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video steps: every one targets the single stream video.ego.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.ego
    # mode is `random`, so this step is non-deterministic.
    # NOTE(review): `scale: 0.95` presumably keeps 95% of the frame; confirm.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
      - video.ego
      scale: 0.95
      mode: random
    # NOTE(review): frames are resized to 224x224 here while the SigLIP
    # checkpoint name below ends in `-256`; presumably siglip_processor resizes
    # again. Confirm.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
      - video.ego
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
      - video.ego
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.ego
    # No state_concat_order / action_concat_order keys are given here.
    # NOTE(review): confirm ConcatTransform accepts their absence.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.ego
    # Final step. max_action_dim (32) and action_horizon (16) equal the
    # action_dim / action_horizon values under `model.config` at the top of
    # this file, even though this entry configures no action keys.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      # Same checkpoint name as `siglip_model_cfg` in the model section.
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id. Id 26 is not assigned in this mapping,
      # and gr1_unified_segmentation (14) is listed last, out of numeric order.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
  # Entry for the `dream` embodiment tag: video steps on one stream, tensor
  # conversion of five state and five action keys, a concat step and the final
  # GR00T IDM transform.
  # NOTE(review): steps are presumably applied in list order; confirm against
  # ComposedModalityTransform.
  dream:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video steps: every one targets the single stream
    # video.ego_view_bg_crop_pad_res256_freq20.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.ego_view_bg_crop_pad_res256_freq20
    # mode is `random`, so this step is non-deterministic.
    # NOTE(review): `scale: 0.95` presumably keeps 95% of the frame; confirm.
    - _target_: gr00t.data.transform.VideoCrop
      apply_to:
      - video.ego_view_bg_crop_pad_res256_freq20
      scale: 0.95
      mode: random
    # NOTE(review): frames are resized to 224x224 here while the SigLIP
    # checkpoint name below ends in `-256`; presumably siglip_processor resizes
    # again. Confirm.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
      - video.ego_view_bg_crop_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
      - video.ego_view_bg_crop_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.ego_view_bg_crop_pad_res256_freq20
    # State and action keys are only converted to tensors in this entry.
    # NOTE(review): no StateActionTransform (normalization) step is listed for
    # either group, unlike e.g. the `oxe_roboset` entry; confirm this is
    # intentional.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
    # Concat orders list the same keys, in the same order, as the steps above.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.ego_view_bg_crop_pad_res256_freq20
      state_concat_order:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
      action_concat_order:
      - action.left_arm
      - action.right_arm
      - action.left_hand
      - action.right_hand
      - action.waist
    # Final step. max_action_dim (32) and action_horizon (16) equal the
    # action_dim / action_horizon values under `model.config` at the top of
    # this file.
    # NOTE(review): 6 images x 16 tokens = 96, and 96 + action_horizon (16)
    # = 112 = max_sequence_length; presumably how the limit was derived.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      # Same checkpoint name as `siglip_model_cfg` in the model section.
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id. Id 26 is not assigned in this mapping,
      # and gr1_unified_segmentation (14) is listed last, out of numeric order.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
  # Entry for the `gr1_unified_segmentation` embodiment tag: video steps on one
  # stream, sin/cos-transformed state, two segmentation action keys, a concat
  # step and the final GR00T IDM transform.
  # NOTE(review): steps are presumably applied in list order; confirm against
  # ComposedModalityTransform.
  gr1_unified_segmentation:
    _target_: gr00t.data.transform.ComposedModalityTransform
    transforms:
    # Video steps: every one targets the single stream
    # video.ego_view_bg_crop_pad_res256_freq20. This entry has no VideoCrop
    # step.
    # NOTE(review): presumably omitted so frames stay aligned with the
    # segmentation targets; confirm.
    - _target_: gr00t.data.transform.VideoToTensor
      apply_to:
      - video.ego_view_bg_crop_pad_res256_freq20
    # NOTE(review): frames are resized to 224x224 here while the SigLIP
    # checkpoint name below ends in `-256`; presumably siglip_processor resizes
    # again. Confirm.
    - _target_: gr00t.data.transform.VideoResize
      apply_to:
      - video.ego_view_bg_crop_pad_res256_freq20
      height: 224
      width: 224
      interpolation: linear
    - _target_: gr00t.data.transform.VideoColorJitter
      apply_to:
      - video.ego_view_bg_crop_pad_res256_freq20
      brightness: 0.3
      contrast: 0.4
      saturation: 0.5
      hue: 0.08
    - _target_: gr00t.data.transform.VideoToNumpy
      apply_to:
      - video.ego_view_bg_crop_pad_res256_freq20
    # State steps: five keys, converted to tensors and then passed through
    # StateActionSinCosTransform (no normalization_modes are configured).
    # NOTE(review): a sin/cos encoding presumably doubles each state's width;
    # confirm the result fits max_state_dim.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
    - _target_: gr00t.data.transform.StateActionSinCosTransform
      apply_to:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
    # Action steps: the "action" keys here are a segmentation target and its
    # mask. They are only converted to tensors; no normalization step is
    # listed.
    - _target_: gr00t.data.transform.StateActionToTensor
      apply_to:
      - action.segmentation_target
      - action.segmentation_target_mask
    # Concat orders list the same keys, in the same order, as the steps above.
    - _target_: gr00t.data.transform.ConcatTransform
      video_concat_order:
      - video.ego_view_bg_crop_pad_res256_freq20
      state_concat_order:
      - state.left_arm
      - state.right_arm
      - state.left_hand
      - state.right_hand
      - state.waist
      action_concat_order:
      - action.segmentation_target
      - action.segmentation_target_mask
    # Final step. max_action_dim (32) and action_horizon (16) equal the
    # action_dim / action_horizon values under `model.config` at the top of
    # this file.
    # NOTE(review): 6 images x 16 tokens = 96, and 96 + action_horizon (16)
    # = 112 = max_sequence_length; presumably how the limit was derived.
    - _target_: gr00t.model.transforms_idm.GR00TIDMTransform
      default_instruction: Perform the default behavior.
      num_visual_tokens_per_frame: 16
      max_num_images_per_sequence: 6
      max_action_dim: 32
      max_sequence_length: 112
      action_horizon: 16
      # Same checkpoint name as `siglip_model_cfg` in the model section.
      siglip_processor:
        _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained
        _convert_: object
        pretrained_model_name_or_path: google/siglip2-large-patch16-256
      # Embodiment name -> integer id. Id 26 is not assigned in this mapping,
      # and gr1_unified_segmentation (14) is listed last, out of numeric order.
      embodiment_tag_mapping:
        real_gr1_arms_only: 0
        real_gr1_arms_only_annotated: 1
        real_gr1_arms_waist: 2
        real_gr1_arms_waist_annotated: 3
        dexmg_gr1_arms_only_inspire: 4
        dexmg_gr1_arms_only_fourier: 5
        dexmg_gr1_arms_waist_fourier: 6
        robocasa_single_arm: 7
        onex_eve_gripper: 8
        robocasa_gr1_arms_only_inspire_hands: 9
        robocasa_gr1_arms_only_fourier_hands: 10
        robocasa_gr1_fixed_lower_body_inspire_hands: 11
        robocasa_gr1_fixed_lower_body_fourier_hands: 12
        robocasa_panda_omron: 13
        robocasa_bimanual_panda_parallel_gripper: 15
        robocasa_bimanual_panda_inspire_hand: 16
        oxe_droid: 17
        oxe_fractal: 18
        oxe_language_table: 19
        oxe_bridge: 20
        real_panda_single_arm: 21
        unknown: 22
        hot3d_hands_only: 23
        gr1_unified: 24
        robocasa_gr1_arms_waist_fourier_hands: 25
        lapa: 27
        oxe_mutex: 28
        oxe_roboset: 29
        oxe_plex: 30
        dream: 31
        gr1_unified_segmentation: 14
# Maps a dataset/embodiment name to a metadata version tag.
# The tags are quoted on purpose: unquoted, a value such as 0217 would be
# parsed as a number and lose its leading zero.
# NOTE(review): the tags look like MMDD dates; confirm.
# NOTE(review): `agibot` has a version here but is not a key in the
# embodiment_tag_mapping blocks directly above; confirm where it is mapped.
metadata_versions:
  robocasa_gr1_arms_only_fourier_hands: '0217'
  robocasa_gr1_fixed_lower_body_fourier_hands: '0217'
  robocasa_bimanual_panda_parallel_gripper: '0217'
  robocasa_bimanual_panda_inspire_hand: '0217'
  robocasa_panda_omron: '0217'
  gr1_unified: '0304'
  oxe_droid: '0221'
  oxe_fractal: '0221'
  oxe_language_table: '0221'
  oxe_bridge: '0221'
  robocasa_gr1_arms_waist_fourier_hands: '0225'
  hot3d_hands_only: '0220'
  agibot: '0306'
  oxe_mutex: '0303'
  oxe_plex: '0303'
  oxe_roboset: '0303'
  lapa: '0305'
  dream: '0308'
  gr1_unified_segmentation: '0309'
# Top-level run settings.
# Same value as `max_state_dim` in the action-head config at the top of this
# file.
max_state_dim: 44
# NOTE(review): presumably the fraction of dataset shards sampled; confirm
# against the sharded dataset class.
dataset_shard_sampling_rate: 0.1
# Dotted import paths: the first points at a `from_mixture_spec` attribute of
# the mixture dataset class, the second at the single-dataset class itself.
mixture_dataset_cls: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec
single_dataset_cls: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotSingleDataset
# NOTE(review): hard-coded absolute path; it will need overriding on any
# machine without this mount.
data_root: /mnt/amlfs-03/shared/datasets
# 40-character hex hash. It contains letters, so it parses as a string without
# quoting.
gr00t_commit_hash: 83c31e50e727eb21e80857dea54541752d89811f
# NOTE(review): 16,384,000,000 steps is unusually large; confirm whether this
# is an intended budget or a placeholder for "run until stopped".
total_training_steps: 16384000000