model: _target_: gr00t.model.idm.IDM _convert_: object config: _target_: gr00t.model.idm.IDMConfig _recursive_: false model_dtype: float32 hidden_size: 0 action_horizon: 16 action_dim: 32 backbone_cfg: _target_: gr00t.model.backbone.IdentityBackbone action_head_cfg: _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM _convert_: object config: _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig _recursive_: false add_seperator_token: true add_pos_embed: true model_dtype: float32 mm_vision_select_layer: -2 max_state_dim: 44 max_action_dim: 32 hidden_size: 1024 tune_vision_tower: true add_view_embed: true max_num_views: 3 siglip_model_cfg: _target_: gr00t.model.action_head.siglip.SiglipModel.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 siglip_hidden_size: 1024 vl_self_attention_cfg: _target_: gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer positional_embeddings: null num_layers: 4 num_attention_heads: 16 attention_head_dim: 64 dropout: 0.2 final_dropout: true diffusion_model_cfg: _target_: gr00t.model.action_head.cross_attention_dit.DiT positional_embeddings: null num_layers: 8 num_attention_heads: 16 attention_head_dim: 64 norm_type: ada_norm dropout: 0.2 final_dropout: true output_dim: 1024 interleave_self_attention: true mm_projector_cfg: _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjector _convert_: object config: _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig hidden_size: 1024 mm_hidden_size: 1024 mm_projector_type: mlp_doubledownsample action_dim: 32 action_horizon: 16 num_inference_timesteps: 16 noise_beta_alpha: 1.5 noise_beta_beta: 1.0 noise_s: 0.999 num_timestep_buckets: 1000 backbone_features_projector_cfg: null train_dataset: _target_: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec _convert_: object mixture_spec: 
- dataset_path: - /mnt/amlfs-01/home/seonghyeony/data/2RA_idm/gr1_unified.UnzeroedArmsOnlyRemoveStaticSliceBGCropPad256Freq20_train - /mnt/amlfs-01/home/seonghyeony/data/2RA_idm/gr1_unified.UnzeroedArmsWaistRemoveStaticSliceBGCropPad256Freq20_train dataset_weight: 1.0 dataset_class: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotSingleDataset all_modality_configs: robocasa_gr1_arms_only_fourier_hands: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions robocasa_gr1_arms_waist_fourier_hands: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: 
gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions robocasa_gr1_fixed_lower_body_fourier_hands: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.agentview_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions robocasa_bimanual_panda_parallel_gripper: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close language: _target_: 
gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions robocasa_bimanual_panda_inspire_hand: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions robocasa_panda_omron: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 
- 14 - 15 modality_keys: - action.end_effector_position - action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions gr1_unified: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.coarse_action lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_droid: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.eef_position - state.eef_rotation - state.gripper_position action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position 
language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_fractal: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image_pad_res256_freq03 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.eef_position - state.eef_rotation - state.gripper_closedness_commanded action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.world_vector - action.rotation_delta - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.natural_language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_language_table: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.rgb_pad_res256_freq10 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.effector_translation action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.action language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_bridge: video: 
_target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image_0 - video.image_1 - video.image_2 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.eef_position - state.eef_rotation - state.gripper_closed action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position - action.eef_rotation - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_mutex: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image - video.wrist_image state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.joint_angles - state.gripper_closed action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position - action.eef_rotation - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_plex: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image - video.wrist_image state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.state action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 
- 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position - action.eef_rotation - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_roboset: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image_left - video.image_right - video.image_wrist state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.joint_position - state.gripper_closed action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.joint_position - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions hot3d_hands_only: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation 
- action.right_joint_rotation lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions agibot: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.top_head - video.hand_left - video.hand_right state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm_joint_position - state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.agibot.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions lapa: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions dream: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_bg_crop_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig 
delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.coarse_action lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions gr1_unified_segmentation: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_bg_crop_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.segmentation_target - action.segmentation_target_mask language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.coarse_action lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions all_transforms: robocasa_gr1_arms_only_fourier_hands: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_pad_res256_freq20 
height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand normalization_modes: state.left_arm: min_max state.right_arm: min_max state.left_hand: min_max state.right_hand: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand normalization_modes: action.right_arm: min_max action.left_arm: min_max action.right_hand: min_max action.left_hand: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 robocasa_gr1_arms_waist_fourier_hands: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: 
gr00t.data.transform.StateActionTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist normalization_modes: state.left_arm: min_max state.right_arm: min_max state.left_hand: min_max state.right_hand: min_max state.waist: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist normalization_modes: action.right_arm: min_max action.left_arm: min_max action.right_hand: min_max action.left_hand: min_max action.waist: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 robocasa_gr1_fixed_lower_body_fourier_hands: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.agentview_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.agentview_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.agentview_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck - 
_target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck normalization_modes: state.left_arm: min_max state.right_arm: min_max state.left_hand: min_max state.right_hand: min_max state.waist: min_max state.neck: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck normalization_modes: action.right_arm: min_max action.left_arm: min_max action.right_hand: min_max action.left_hand: min_max action.waist: min_max action.neck: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.agentview_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 robocasa_bimanual_panda_parallel_gripper: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - 
video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos normalization_modes: state.right_arm_eef_pos: min_max state.right_gripper_qpos: min_max state.left_arm_eef_pos: min_max state.left_gripper_qpos: min_max target_rotations: state.right_arm_eef_quat: rotation_6d state.left_arm_eef_quat: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close normalization_modes: action.right_gripper_close: binary action.left_gripper_close: binary - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state_concat_order: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos action_concat_order: - action.right_arm_eef_pos - action.right_arm_eef_rot - 
action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 robocasa_bimanual_panda_inspire_hand: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - 
video.agentview_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand normalization_modes: state.right_arm_eef_pos: min_max state.right_hand: min_max state.left_arm_eef_pos: min_max state.left_hand: min_max target_rotations: state.right_arm_eef_quat: rotation_6d state.left_arm_eef_quat: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand normalization_modes: action.right_hand: min_max action.left_hand: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state_concat_order: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand 
action_concat_order: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 robocasa_panda_omron: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 height: 224 width: 224 interpolation: linear - 
_target_: gr00t.data.transform.VideoColorJitter apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation normalization_modes: state.end_effector_position_relative: min_max state.end_effector_rotation_relative: min_max state.gripper_qpos: min_max state.base_position: min_max state.base_rotation: min_max target_rotations: state.end_effector_rotation_relative: rotation_6d state.base_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.end_effector_position - action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.end_effector_position - action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode normalization_modes: action.end_effector_position: min_max action.end_effector_rotation: min_max action.gripper_close: binary action.base_motion: min_max action.control_mode: binary - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 state_concat_order: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation action_concat_order: - action.end_effector_position - 
action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 gr1_unified: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - 
state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionSinCosTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist normalization_modes: action.left_arm: min_max action.right_arm: min_max action.left_hand: min_max action.right_hand: min_max action.waist: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_droid: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.exterior_image_1_left_pad_res256_freq15 - 
video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.eef_position - state.eef_rotation - state.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.eef_position - state.eef_rotation - state.gripper_position normalization_modes: state.eef_position: min_max state.gripper_position: min_max target_rotations: state.eef_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation_delta: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 state_concat_order: - state.eef_position - state.eef_rotation - state.gripper_position action_concat_order: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_fractal: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image_pad_res256_freq03 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image_pad_res256_freq03 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image_pad_res256_freq03 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image_pad_res256_freq03 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image_pad_res256_freq03 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.eef_position - state.eef_rotation - state.gripper_closedness_commanded - _target_: gr00t.data.transform.StateActionTransform apply_to: - 
state.eef_position - state.eef_rotation - state.gripper_closedness_commanded normalization_modes: state.eef_position: min_max state.gripper_closedness_commanded: min_max target_rotations: state.eef_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.world_vector - action.rotation_delta - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.world_vector - action.rotation_delta - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.rotation_delta: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image_pad_res256_freq03 state_concat_order: - state.eef_position - state.eef_rotation - state.gripper_closedness_commanded action_concat_order: - action.world_vector - action.rotation_delta - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 
robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_language_table: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.rgb_pad_res256_freq10 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.rgb_pad_res256_freq10 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.rgb_pad_res256_freq10 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.rgb_pad_res256_freq10 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.rgb_pad_res256_freq10 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.effector_translation - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.effector_translation normalization_modes: state.effector_translation: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.action - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.action normalization_modes: action.action: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.rgb_pad_res256_freq10 state_concat_order: - state.effector_translation action_concat_order: - action.action - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_bridge: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image_0 - video.image_1 - video.image_2 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image_0 - video.image_1 - video.image_2 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image_0 - video.image_1 - video.image_2 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image_0 - video.image_1 - video.image_2 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image_0 - video.image_1 - video.image_2 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.eef_position - state.eef_rotation - state.gripper_closed - 
_target_: gr00t.data.transform.StateActionTransform apply_to: - state.eef_position - state.eef_rotation - state.gripper_closed normalization_modes: state.eef_position: min_max state.gripper_closed: min_max target_rotations: state.eef_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position - action.eef_rotation - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image_0 - video.image_1 - video.image_2 state_concat_order: - state.eef_position - state.eef_rotation - state.gripper_closed action_concat_order: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 hot3d_hands_only: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation - _target_: 
gr00t.data.transform.StateActionTransform apply_to: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation normalization_modes: state.left_wrist_position: min_max state.right_wrist_position: min_max target_rotations: state.left_wrist_rotation: quaternion state.right_wrist_rotation: quaternion - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation normalization_modes: action.left_wrist_position: min_max action.right_wrist_position: min_max target_rotations: action.left_wrist_rotation: quaternion action.right_wrist_rotation: quaternion - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view state_concat_order: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation action_concat_order: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 agibot: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.top_head - video.hand_left - video.hand_right - _target_: gr00t.data.transform.VideoCrop apply_to: - video.top_head - video.hand_left - video.hand_right scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.top_head - video.hand_left - video.hand_right height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.top_head - video.hand_left - video.hand_right brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.top_head - video.hand_left - video.hand_right - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm_joint_position - 
state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm_joint_position - state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position normalization_modes: state.left_arm_joint_position: min_max state.right_arm_joint_position: min_max state.left_effector_position: min_max state.right_effector_position: min_max state.head_position: min_max state.waist_position: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity normalization_modes: action.left_arm_joint_position: min_max action.right_arm_joint_position: min_max action.left_effector_position: min_max action.right_effector_position: min_max action.head_position: min_max action.waist_position: min_max action.robot_velocity: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.top_head - video.hand_left - video.hand_right state_concat_order: - state.left_arm_joint_position - state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position action_concat_order: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity - _target_: 
gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_mutex: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image - video.wrist_image scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image - video.wrist_image height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image - video.wrist_image brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.joint_angles - state.gripper_closed - 
_target_: gr00t.data.transform.StateActionTransform apply_to: - state.joint_angles - state.gripper_closed normalization_modes: state.joint_angles: min_max state.gripper_closed: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position - action.eef_rotation - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image - video.wrist_image state_concat_order: - state.joint_angles - state.gripper_closed action_concat_order: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 
29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_plex: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image - video.wrist_image scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image - video.wrist_image height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image - video.wrist_image brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.state - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.state normalization_modes: state.state: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position - action.eef_rotation - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image - video.wrist_image state_concat_order: - state.state action_concat_order: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_roboset: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image_left - video.image_right - video.image_wrist - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image_left - video.image_right - video.image_wrist scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image_left - video.image_right - video.image_wrist height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image_left - video.image_right - video.image_wrist brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image_left - video.image_right - video.image_wrist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - 
state.joint_position - state.gripper_closed - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.joint_position - state.gripper_closed normalization_modes: state.joint_position: min_max state.gripper_closed: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.joint_position - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.joint_position - action.gripper_position normalization_modes: action.joint_position: min_max action.gripper_position: binary - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image_left - video.image_right - video.image_wrist state_concat_order: - state.joint_position - state.gripper_closed action_concat_order: - action.joint_position - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 
29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 lapa: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 dream: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - 
_target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view_bg_crop_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_bg_crop_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_bg_crop_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_bg_crop_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 gr1_unified_segmentation: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_bg_crop_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_bg_crop_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionSinCosTransform apply_to: - state.left_arm - state.right_arm - 
state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.segmentation_target - action.segmentation_target_mask - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_bg_crop_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.segmentation_target - action.segmentation_target_mask - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 metadata_versions: robocasa_gr1_arms_only_fourier_hands: '0217' robocasa_gr1_fixed_lower_body_fourier_hands: '0217' robocasa_bimanual_panda_parallel_gripper: '0217' robocasa_bimanual_panda_inspire_hand: '0217' robocasa_panda_omron: '0217' gr1_unified: '0304' oxe_droid: '0221' 
oxe_fractal: '0221' oxe_language_table: '0221' oxe_bridge: '0221' robocasa_gr1_arms_waist_fourier_hands: '0225' hot3d_hands_only: '0220' agibot: '0306' oxe_mutex: '0303' oxe_plex: '0303' oxe_roboset: '0303' lapa: '0305' dream: '0308' gr1_unified_segmentation: '0309' dataset_kwargs: video_backend: decord use_global_metadata: true mixture_kwargs: training: true balance_dataset_weights: true seed: 42 shard_sampling_rate: 0.1 trainer: _target_: gr00t.experiment.dual_brain.experiment.DualBrainTrainer _partial_: true _recursive_: false callbacks: null model: ??? train_dataset: ??? compute_dtype: ??? benchmark_time: false enable_profiling: false profiling_steps: 5 wandb_project: dream_idm output_dir: /mnt/amlfs-01/home/seonghyeony/checkpoints/gr00t_s_gr1_idm_real_global_stats load_from_yaml: null gear_credentials: ??? upload_checkpoints: false upload_every: 10000 upload_last_n_checkpoints: 5 remove_unused_columns: false bf16: true tf32: true global_batch_size: null raise_error_if_global_batch_size_not_set: false per_device_train_batch_size: 64 per_device_eval_batch_size: 64 gradient_accumulation_steps: 1 dataloader_num_workers: 6 dataloader_pin_memory: false dataloader_persistent_workers: true optim: adamw_torch learning_rate: 0.0001 adam_beta1: 0.95 adam_beta2: 0.999 adam_epsilon: 1.0e-08 weight_decay: 1.0e-05 lr_scheduler_type: cosine warmup_ratio: 0.05 logging_steps: 10.0 num_train_epochs: 1000 max_steps: 60000 save_strategy: steps save_steps: 1000 eval_strategy: 'no' save_total_limit: 30 report_to: wandb seed: 21 do_eval: false gradient_checkpointing: false ddp_find_unused_parameters: false ddp_bucket_cap_mb: 100 ray_num_workers: ??? 
eval_bf16: true torch_compile_mode: null pretrained_model_path: null only_tune_projectors: false training_args: _target_: transformers.TrainingArguments output_dir: /mnt/amlfs-01/home/seonghyeony/checkpoints/gr00t_s_gr1_idm_real_global_stats run_name: gr00t_s_gr1_idm_real_global_stats remove_unused_columns: false deepspeed: gr00t/experiment/dual_brain/configs/deepspeed/zero2.json gradient_checkpointing: false bf16: true tf32: true per_device_train_batch_size: 64 per_device_eval_batch_size: 64 gradient_accumulation_steps: 1 dataloader_num_workers: 6 dataloader_pin_memory: false dataloader_persistent_workers: true optim: adamw_torch adam_beta1: 0.95 adam_beta2: 0.999 adam_epsilon: 1.0e-08 learning_rate: 0.0001 weight_decay: 1.0e-05 warmup_ratio: 0.05 lr_scheduler_type: cosine logging_steps: 10.0 num_train_epochs: 1000 max_steps: 60000 save_strategy: steps save_steps: 1000 save_total_limit: 30 report_to: wandb seed: 21 do_eval: false ddp_find_unused_parameters: false ddp_bucket_cap_mb: 100 torch_compile_mode: null add_seperator_token: true add_pos_embed: true hidden_size: 1024 attn_dropout: 0.2 siglip_hidden_size: 1024 siglip_version: google/siglip2-large-patch16-256 action_head_cfg: _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM _convert_: object config: _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig _recursive_: false add_seperator_token: true add_pos_embed: true model_dtype: float32 mm_vision_select_layer: -2 max_state_dim: 44 max_action_dim: 32 hidden_size: 1024 tune_vision_tower: true add_view_embed: true max_num_views: 3 siglip_model_cfg: _target_: gr00t.model.action_head.siglip.SiglipModel.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 siglip_hidden_size: 1024 vl_self_attention_cfg: _target_: gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer positional_embeddings: null num_layers: 4 
num_attention_heads: 16 attention_head_dim: 64 dropout: 0.2 final_dropout: true diffusion_model_cfg: _target_: gr00t.model.action_head.cross_attention_dit.DiT positional_embeddings: null num_layers: 8 num_attention_heads: 16 attention_head_dim: 64 norm_type: ada_norm dropout: 0.2 final_dropout: true output_dim: 1024 interleave_self_attention: true mm_projector_cfg: _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjector _convert_: object config: _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig hidden_size: 1024 mm_hidden_size: 1024 mm_projector_type: mlp_doubledownsample action_dim: 32 action_horizon: 16 num_inference_timesteps: 16 noise_beta_alpha: 1.5 noise_beta_beta: 1.0 noise_s: 0.999 num_timestep_buckets: 1000 backbone_features_projector_cfg: null backbone_hidden_size: 0 backbone_cfg: _target_: gr00t.model.backbone.IdentityBackbone embodiment_tag_to_projector_index: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 num_visual_tokens_per_frame: 16 max_action_dim: 32 language_dropout_prob: 0.0 model_image_resolution: 224 max_sequence_length: 112 model_specific_transform: _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default 
behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 data_collator: _target_: gr00t.model.transforms_idm.DefaultDataCollatorGR00TIDM use_global_metadata: true action_horizon: 16 state_horizon: 1 image_resolution: 224 totensor_cfg: _target_: gr00t.data.transform.VideoToTensor apply_to: ??? crop_cfg: _target_: gr00t.data.transform.VideoCrop apply_to: ??? scale: 0.95 mode: random resize_cfg: _target_: gr00t.data.transform.VideoResize apply_to: ??? height: 224 width: 224 interpolation: linear color_jitter_cfg: _target_: gr00t.data.transform.VideoColorJitter apply_to: ??? brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 random_grayscale_cfg: _target_: gr00t.data.transform.VideoRandomGrayscale apply_to: ??? p: 0.1 random_posterize_cfg: _target_: gr00t.data.transform.VideoRandomPosterize apply_to: ??? bits: 4 p: 0.1 to_numpy_cfg: _target_: gr00t.data.transform.VideoToNumpy apply_to: ??? 
modality_config_robocasa_gr1_arms_only_fourier_hands: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_robocasa_gr1_arms_only_fourier_hands: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand normalization_modes: state.left_arm: min_max state.right_arm: min_max state.left_hand: min_max state.right_hand: min_max 
- _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand normalization_modes: action.right_arm: min_max action.left_arm: min_max action.right_hand: min_max action.left_hand: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_robocasa_gr1_arms_waist_fourier_hands: 
video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_robocasa_gr1_arms_waist_fourier_hands: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist normalization_modes: state.left_arm: min_max state.right_arm: min_max state.left_hand: min_max state.right_hand: 
min_max state.waist: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist normalization_modes: action.right_arm: min_max action.left_arm: min_max action.right_hand: min_max action.left_hand: min_max action.waist: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 
oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_robocasa_panda_omron: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.end_effector_position - action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_robocasa_panda_omron: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.res256_image_side_0 - 
video.res256_image_side_1 - video.res256_image_wrist_0 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation normalization_modes: state.end_effector_position_relative: min_max state.end_effector_rotation_relative: min_max state.gripper_qpos: min_max state.base_position: min_max state.base_rotation: min_max target_rotations: state.end_effector_rotation_relative: rotation_6d state.base_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.end_effector_position - action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.end_effector_position - action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode normalization_modes: action.end_effector_position: min_max action.end_effector_rotation: min_max action.gripper_close: binary action.base_motion: min_max action.control_mode: binary - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 state_concat_order: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation action_concat_order: - action.end_effector_position - action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_robocasa_gr1_fixed_lower_body_fourier_hands: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.agentview_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action 
dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_robocasa_gr1_fixed_lower_body_fourier_hands: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.agentview_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.agentview_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.agentview_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck normalization_modes: state.left_arm: min_max state.right_arm: min_max state.left_hand: min_max state.right_hand: min_max state.waist: min_max state.neck: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck normalization_modes: action.right_arm: min_max action.left_arm: min_max action.right_hand: min_max action.left_hand: min_max action.waist: min_max action.neck: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.agentview_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - 
state.right_hand - state.waist - state.neck action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_robocasa_bimanual_panda_parallel_gripper: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 
- 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_robocasa_bimanual_panda_parallel_gripper: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos - _target_: 
gr00t.data.transform.StateActionTransform apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos normalization_modes: state.right_arm_eef_pos: min_max state.right_gripper_qpos: min_max state.left_arm_eef_pos: min_max state.left_gripper_qpos: min_max target_rotations: state.right_arm_eef_quat: rotation_6d state.left_arm_eef_quat: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close normalization_modes: action.right_gripper_close: binary action.left_gripper_close: binary - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state_concat_order: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos action_concat_order: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_robocasa_bimanual_panda_inspire_hand: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 
modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_robocasa_bimanual_panda_inspire_hand: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand normalization_modes: state.right_arm_eef_pos: min_max state.right_hand: min_max state.left_arm_eef_pos: min_max state.left_hand: min_max target_rotations: 
state.right_arm_eef_quat: rotation_6d state.left_arm_eef_quat: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand normalization_modes: action.right_hand: min_max action.left_hand: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state_concat_order: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand action_concat_order: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_gr1_unified_segmentation: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_bg_crop_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.segmentation_target - action.segmentation_target_mask language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.coarse_action lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 
modality_keys: - dream_actions transform_gr1_unified_segmentation: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_bg_crop_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_bg_crop_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionSinCosTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.segmentation_target - action.segmentation_target_mask - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_bg_crop_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.segmentation_target - action.segmentation_target_mask - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_gr1_unified: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_bg_crop_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.coarse_action lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig 
delta_indices: - 0 modality_keys: - dream_actions transform_gr1_unified: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionSinCosTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist normalization_modes: action.left_arm: min_max action.right_arm: min_max action.left_hand: min_max action.right_hand: min_max action.waist: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_oxe_droid: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.eef_position - state.eef_rotation - state.gripper_position action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - 
lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_oxe_droid: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.eef_position - state.eef_rotation - state.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.eef_position - state.eef_rotation - state.gripper_position normalization_modes: state.eef_position: min_max state.gripper_position: min_max target_rotations: state.eef_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position 
normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation_delta: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 state_concat_order: - state.eef_position - state.eef_rotation - state.gripper_position action_concat_order: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_oxe_fractal: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image_pad_res256_freq03 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - 
state.eef_position - state.eef_rotation - state.gripper_closedness_commanded action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.world_vector - action.rotation_delta - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.natural_language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_oxe_fractal: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image_pad_res256_freq03 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image_pad_res256_freq03 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image_pad_res256_freq03 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image_pad_res256_freq03 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image_pad_res256_freq03 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.eef_position - state.eef_rotation - state.gripper_closedness_commanded - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.eef_position - state.eef_rotation - state.gripper_closedness_commanded normalization_modes: state.eef_position: min_max state.gripper_closedness_commanded: min_max target_rotations: state.eef_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.world_vector - action.rotation_delta - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.world_vector - action.rotation_delta - 
action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.rotation_delta: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image_pad_res256_freq03 state_concat_order: - state.eef_position - state.eef_rotation - state.gripper_closedness_commanded action_concat_order: - action.world_vector - action.rotation_delta - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_oxe_language_table: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.rgb_pad_res256_freq10 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.effector_translation action: _target_: gr00t.data.dataset.ModalityConfig 
delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.action language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_oxe_language_table: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.rgb_pad_res256_freq10 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.rgb_pad_res256_freq10 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.rgb_pad_res256_freq10 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.rgb_pad_res256_freq10 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.rgb_pad_res256_freq10 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.effector_translation - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.effector_translation normalization_modes: state.effector_translation: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.action - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.action normalization_modes: action.action: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.rgb_pad_res256_freq10 state_concat_order: - state.effector_translation action_concat_order: - action.action - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_oxe_bridge: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image_0 - video.image_1 - video.image_2 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.eef_position - state.eef_rotation - state.gripper_closed action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position - action.eef_rotation - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 
modality_keys: - dream_actions transform_oxe_bridge: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image_0 - video.image_1 - video.image_2 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image_0 - video.image_1 - video.image_2 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image_0 - video.image_1 - video.image_2 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image_0 - video.image_1 - video.image_2 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image_0 - video.image_1 - video.image_2 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.eef_position - state.eef_rotation - state.gripper_closed - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.eef_position - state.eef_rotation - state.gripper_closed normalization_modes: state.eef_position: min_max state.gripper_closed: min_max target_rotations: state.eef_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position - action.eef_rotation - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image_0 - video.image_1 - video.image_2 state_concat_order: - state.eef_position - state.eef_rotation - state.gripper_closed action_concat_order: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_hot3d_hands_only: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: 
gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_hot3d_hands_only: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation normalization_modes: state.left_wrist_position: min_max state.right_wrist_position: min_max target_rotations: state.left_wrist_rotation: quaternion state.right_wrist_rotation: quaternion - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation normalization_modes: action.left_wrist_position: min_max action.right_wrist_position: min_max target_rotations: action.left_wrist_rotation: quaternion action.right_wrist_rotation: 
quaternion - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view state_concat_order: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation action_concat_order: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_agibot: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.top_head - video.hand_left - video.hand_right state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - 
state.left_arm_joint_position - state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.agibot.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_agibot: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.top_head - video.hand_left - video.hand_right - _target_: gr00t.data.transform.VideoCrop apply_to: - video.top_head - video.hand_left - video.hand_right scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.top_head - video.hand_left - video.hand_right height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.top_head - video.hand_left - video.hand_right brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.top_head - video.hand_left - video.hand_right - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm_joint_position - state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm_joint_position - state.right_arm_joint_position - 
state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position normalization_modes: state.left_arm_joint_position: min_max state.right_arm_joint_position: min_max state.left_effector_position: min_max state.right_effector_position: min_max state.head_position: min_max state.waist_position: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity normalization_modes: action.left_arm_joint_position: min_max action.right_arm_joint_position: min_max action.left_effector_position: min_max action.right_effector_position: min_max action.head_position: min_max action.waist_position: min_max action.robot_velocity: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.top_head - video.hand_left - video.hand_right state_concat_order: - state.left_arm_joint_position - state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position action_concat_order: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_oxe_mutex: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image - video.wrist_image state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.joint_angles - state.gripper_closed action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position - action.eef_rotation - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions 
transform_oxe_mutex: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image - video.wrist_image scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image - video.wrist_image height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image - video.wrist_image brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.joint_angles - state.gripper_closed - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.joint_angles - state.gripper_closed normalization_modes: state.joint_angles: min_max state.gripper_closed: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position - action.eef_rotation - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image - video.wrist_image state_concat_order: - state.joint_angles - state.gripper_closed action_concat_order: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_oxe_plex: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image - video.wrist_image state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.state action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position - action.eef_rotation - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_oxe_plex: _target_: 
gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image - video.wrist_image scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image - video.wrist_image height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image - video.wrist_image brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.state - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.state normalization_modes: state.state: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position - action.eef_rotation - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image - video.wrist_image state_concat_order: - state.state action_concat_order: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_oxe_roboset: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image_left - video.image_right - video.image_wrist state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.joint_position - state.gripper_closed action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.joint_position - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions 
transform_oxe_roboset: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image_left - video.image_right - video.image_wrist - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image_left - video.image_right - video.image_wrist scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image_left - video.image_right - video.image_wrist height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image_left - video.image_right - video.image_wrist brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image_left - video.image_right - video.image_wrist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.joint_position - state.gripper_closed - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.joint_position - state.gripper_closed normalization_modes: state.joint_position: min_max state.gripper_closed: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.joint_position - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.joint_position - action.gripper_position normalization_modes: action.joint_position: min_max action.gripper_position: binary - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image_left - video.image_right - video.image_wrist state_concat_order: - state.joint_position - state.gripper_closed action_concat_order: - action.joint_position - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_lapa: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_lapa: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego height: 224 width: 224 interpolation: linear - _target_: 
gr00t.data.transform.VideoColorJitter apply_to: - video.ego brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_config_dream: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_bg_crop_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - 
action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.coarse_action lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transform_dream: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view_bg_crop_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_bg_crop_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_bg_crop_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_bg_crop_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 modality_configs: robocasa_gr1_arms_only_fourier_hands: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig 
delta_indices: - 0 modality_keys: - dream_actions robocasa_gr1_arms_waist_fourier_hands: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions robocasa_gr1_fixed_lower_body_fourier_hands: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.agentview_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions robocasa_bimanual_panda_parallel_gripper: video: _target_: 
gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions robocasa_bimanual_panda_inspire_hand: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - 
annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions robocasa_panda_omron: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.end_effector_position - action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions gr1_unified: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.coarse_action lapa_action: _target_: 
gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_droid: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.eef_position - state.eef_rotation - state.gripper_position action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_fractal: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image_pad_res256_freq03 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.eef_position - state.eef_rotation - state.gripper_closedness_commanded action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.world_vector - action.rotation_delta - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.natural_language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig 
delta_indices: - 0 modality_keys: - dream_actions oxe_language_table: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.rgb_pad_res256_freq10 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.effector_translation action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.action language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_bridge: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image_0 - video.image_1 - video.image_2 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.eef_position - state.eef_rotation - state.gripper_closed action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position - action.eef_rotation - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_mutex: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image - video.wrist_image state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.joint_angles - state.gripper_closed action: _target_: gr00t.data.dataset.ModalityConfig 
delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position - action.eef_rotation - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_plex: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image - video.wrist_image state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.state action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.eef_position - action.eef_rotation - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions oxe_roboset: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.image_left - video.image_right - video.image_wrist state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.joint_position - state.gripper_closed action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.joint_position - action.gripper_position language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.language.language_instruction lapa_action: _target_: gr00t.data.dataset.ModalityConfig 
delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions hot3d_hands_only: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions agibot: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.top_head - video.hand_left - video.hand_right state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm_joint_position - state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.agibot.task_description lapa_action: _target_: 
gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions lapa: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.action.task_description lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions dream: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_bg_crop_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 modality_keys: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.coarse_action lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions gr1_unified_segmentation: video: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 16 modality_keys: - video.ego_view_bg_crop_pad_res256_freq20 state: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 
modality_keys: - action.segmentation_target - action.segmentation_target_mask language: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - annotation.human.coarse_action lapa_action: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - lapa_action dream_actions: _target_: gr00t.data.dataset.ModalityConfig delta_indices: - 0 modality_keys: - dream_actions transforms: robocasa_gr1_arms_only_fourier_hands: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand normalization_modes: state.left_arm: min_max state.right_arm: min_max state.left_hand: min_max state.right_hand: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand normalization_modes: action.right_arm: min_max action.left_arm: min_max action.right_hand: min_max action.left_hand: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - 
video.ego_view_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 robocasa_gr1_arms_waist_fourier_hands: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - 
video.ego_view_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist normalization_modes: state.left_arm: min_max state.right_arm: min_max state.left_hand: min_max state.right_hand: min_max state.waist: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist normalization_modes: action.right_arm: min_max action.left_arm: min_max action.right_hand: min_max action.left_hand: min_max action.waist: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 robocasa_gr1_fixed_lower_body_fourier_hands: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.agentview_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.agentview_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.agentview_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck - 
_target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck normalization_modes: state.left_arm: min_max state.right_arm: min_max state.left_hand: min_max state.right_hand: min_max state.waist: min_max state.neck: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck normalization_modes: action.right_arm: min_max action.left_arm: min_max action.right_hand: min_max action.left_hand: min_max action.waist: min_max action.neck: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.agentview_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - state.neck action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - action.neck - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 robocasa_bimanual_panda_parallel_gripper: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - 
video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos normalization_modes: state.right_arm_eef_pos: min_max state.right_gripper_qpos: min_max state.left_arm_eef_pos: min_max state.left_gripper_qpos: min_max target_rotations: state.right_arm_eef_quat: rotation_6d state.left_arm_eef_quat: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close normalization_modes: action.right_gripper_close: binary action.left_gripper_close: binary - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state_concat_order: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_gripper_qpos - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_gripper_qpos action_concat_order: - action.right_arm_eef_pos - action.right_arm_eef_rot - 
action.right_gripper_close - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_gripper_close - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 robocasa_bimanual_panda_inspire_hand: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - 
video.agentview_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand normalization_modes: state.right_arm_eef_pos: min_max state.right_hand: min_max state.left_arm_eef_pos: min_max state.left_hand: min_max target_rotations: state.right_arm_eef_quat: rotation_6d state.left_arm_eef_quat: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand normalization_modes: action.right_hand: min_max action.left_hand: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.robot0_eye_in_hand_pad_res256_freq20 - video.robot1_eye_in_hand_pad_res256_freq20 - video.agentview_pad_res256_freq20 state_concat_order: - state.right_arm_eef_pos - state.right_arm_eef_quat - state.right_hand - state.left_arm_eef_pos - state.left_arm_eef_quat - state.left_hand 
action_concat_order: - action.right_arm_eef_pos - action.right_arm_eef_rot - action.right_hand - action.left_arm_eef_pos - action.left_arm_eef_rot - action.left_hand - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 robocasa_panda_omron: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 height: 224 width: 224 interpolation: linear - 
_target_: gr00t.data.transform.VideoColorJitter apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation normalization_modes: state.end_effector_position_relative: min_max state.end_effector_rotation_relative: min_max state.gripper_qpos: min_max state.base_position: min_max state.base_rotation: min_max target_rotations: state.end_effector_rotation_relative: rotation_6d state.base_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.end_effector_position - action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.end_effector_position - action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode normalization_modes: action.end_effector_position: min_max action.end_effector_rotation: min_max action.gripper_close: binary action.base_motion: min_max action.control_mode: binary - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.res256_image_side_0 - video.res256_image_side_1 - video.res256_image_wrist_0 state_concat_order: - state.end_effector_position_relative - state.end_effector_rotation_relative - state.gripper_qpos - state.base_position - state.base_rotation action_concat_order: - action.end_effector_position - 
action.end_effector_rotation - action.gripper_close - action.base_motion - action.control_mode - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 gr1_unified: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - 
state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionSinCosTransform apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist normalization_modes: action.left_arm: min_max action.right_arm: min_max action.left_hand: min_max action.right_hand: min_max action.waist: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_droid: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.exterior_image_1_left_pad_res256_freq15 - 
video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.eef_position - state.eef_rotation - state.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.eef_position - state.eef_rotation - state.gripper_position normalization_modes: state.eef_position: min_max state.gripper_position: min_max target_rotations: state.eef_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation_delta: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.exterior_image_1_left_pad_res256_freq15 - video.exterior_image_2_left_pad_res256_freq15 - video.wrist_image_left_pad_res256_freq15 state_concat_order: - state.eef_position - state.eef_rotation - state.gripper_position action_concat_order: - action.eef_position_delta - action.eef_rotation_delta - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_fractal: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image_pad_res256_freq03 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image_pad_res256_freq03 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image_pad_res256_freq03 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image_pad_res256_freq03 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image_pad_res256_freq03 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.eef_position - state.eef_rotation - state.gripper_closedness_commanded - _target_: gr00t.data.transform.StateActionTransform apply_to: - 
state.eef_position - state.eef_rotation - state.gripper_closedness_commanded normalization_modes: state.eef_position: min_max state.gripper_closedness_commanded: min_max target_rotations: state.eef_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.world_vector - action.rotation_delta - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.world_vector - action.rotation_delta - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.rotation_delta: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image_pad_res256_freq03 state_concat_order: - state.eef_position - state.eef_rotation - state.gripper_closedness_commanded action_concat_order: - action.world_vector - action.rotation_delta - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 
robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_language_table: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.rgb_pad_res256_freq10 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.rgb_pad_res256_freq10 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.rgb_pad_res256_freq10 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.rgb_pad_res256_freq10 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.rgb_pad_res256_freq10 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.effector_translation - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.effector_translation normalization_modes: state.effector_translation: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.action - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.action normalization_modes: action.action: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.rgb_pad_res256_freq10 state_concat_order: - state.effector_translation action_concat_order: - action.action - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_bridge: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image_0 - video.image_1 - video.image_2 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image_0 - video.image_1 - video.image_2 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image_0 - video.image_1 - video.image_2 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image_0 - video.image_1 - video.image_2 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image_0 - video.image_1 - video.image_2 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.eef_position - state.eef_rotation - state.gripper_closed - 
_target_: gr00t.data.transform.StateActionTransform apply_to: - state.eef_position - state.eef_rotation - state.gripper_closed normalization_modes: state.eef_position: min_max state.gripper_closed: min_max target_rotations: state.eef_rotation: rotation_6d - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position - action.eef_rotation - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image_0 - video.image_1 - video.image_2 state_concat_order: - state.eef_position - state.eef_rotation - state.gripper_closed action_concat_order: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 hot3d_hands_only: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation - _target_: 
gr00t.data.transform.StateActionTransform apply_to: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation normalization_modes: state.left_wrist_position: min_max state.right_wrist_position: min_max target_rotations: state.left_wrist_rotation: quaternion state.right_wrist_rotation: quaternion - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation normalization_modes: action.left_wrist_position: min_max action.right_wrist_position: min_max target_rotations: action.left_wrist_rotation: quaternion action.right_wrist_rotation: quaternion - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view state_concat_order: - state.left_wrist_position - state.left_wrist_rotation - state.left_joint_rotation - state.right_wrist_position - state.right_wrist_rotation - state.right_joint_rotation action_concat_order: - action.left_wrist_position - action.left_wrist_rotation - action.left_joint_rotation - action.right_wrist_position - action.right_wrist_rotation - action.right_joint_rotation - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 agibot: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.top_head - video.hand_left - video.hand_right - _target_: gr00t.data.transform.VideoCrop apply_to: - video.top_head - video.hand_left - video.hand_right scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.top_head - video.hand_left - video.hand_right height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.top_head - video.hand_left - video.hand_right brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.top_head - video.hand_left - video.hand_right - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm_joint_position - 
state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.left_arm_joint_position - state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position normalization_modes: state.left_arm_joint_position: min_max state.right_arm_joint_position: min_max state.left_effector_position: min_max state.right_effector_position: min_max state.head_position: min_max state.waist_position: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity normalization_modes: action.left_arm_joint_position: min_max action.right_arm_joint_position: min_max action.left_effector_position: min_max action.right_effector_position: min_max action.head_position: min_max action.waist_position: min_max action.robot_velocity: min_max - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.top_head - video.hand_left - video.hand_right state_concat_order: - state.left_arm_joint_position - state.right_arm_joint_position - state.left_effector_position - state.right_effector_position - state.head_position - state.waist_position action_concat_order: - action.left_arm_joint_position - action.right_arm_joint_position - action.left_effector_position - action.right_effector_position - action.head_position - action.waist_position - action.robot_velocity - _target_: 
gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_mutex: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image - video.wrist_image scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image - video.wrist_image height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image - video.wrist_image brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.joint_angles - state.gripper_closed - 
_target_: gr00t.data.transform.StateActionTransform apply_to: - state.joint_angles - state.gripper_closed normalization_modes: state.joint_angles: min_max state.gripper_closed: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position - action.eef_rotation - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image - video.wrist_image state_concat_order: - state.joint_angles - state.gripper_closed action_concat_order: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 
29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_plex: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image - video.wrist_image scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image - video.wrist_image height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image - video.wrist_image brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image - video.wrist_image - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.state - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.state normalization_modes: state.state: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.eef_position - action.eef_rotation - action.gripper_position normalization_modes: action.gripper_position: binary target_rotations: action.eef_rotation: axis_angle - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image - video.wrist_image state_concat_order: - state.state action_concat_order: - action.eef_position - action.eef_rotation - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 oxe_roboset: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.image_left - video.image_right - video.image_wrist - _target_: gr00t.data.transform.VideoCrop apply_to: - video.image_left - video.image_right - video.image_wrist scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.image_left - video.image_right - video.image_wrist height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.image_left - video.image_right - video.image_wrist brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.image_left - video.image_right - video.image_wrist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - 
state.joint_position - state.gripper_closed - _target_: gr00t.data.transform.StateActionTransform apply_to: - state.joint_position - state.gripper_closed normalization_modes: state.joint_position: min_max state.gripper_closed: min_max - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.joint_position - action.gripper_position - _target_: gr00t.data.transform.StateActionTransform apply_to: - action.joint_position - action.gripper_position normalization_modes: action.joint_position: min_max action.gripper_position: binary - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.image_left - video.image_right - video.image_wrist state_concat_order: - state.joint_position - state.gripper_closed action_concat_order: - action.joint_position - action.gripper_position - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 
29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 lapa: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 dream: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - 
_target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.VideoCrop apply_to: - video.ego_view_bg_crop_pad_res256_freq20 scale: 0.95 mode: random - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_bg_crop_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_bg_crop_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_bg_crop_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.left_arm - action.right_arm - action.left_hand - action.right_hand - action.waist - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. 
num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 gr1_unified_segmentation: _target_: gr00t.data.transform.ComposedModalityTransform transforms: - _target_: gr00t.data.transform.VideoToTensor apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.VideoResize apply_to: - video.ego_view_bg_crop_pad_res256_freq20 height: 224 width: 224 interpolation: linear - _target_: gr00t.data.transform.VideoColorJitter apply_to: - video.ego_view_bg_crop_pad_res256_freq20 brightness: 0.3 contrast: 0.4 saturation: 0.5 hue: 0.08 - _target_: gr00t.data.transform.VideoToNumpy apply_to: - video.ego_view_bg_crop_pad_res256_freq20 - _target_: gr00t.data.transform.StateActionToTensor apply_to: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionSinCosTransform apply_to: - state.left_arm - state.right_arm - 
state.left_hand - state.right_hand - state.waist - _target_: gr00t.data.transform.StateActionToTensor apply_to: - action.segmentation_target - action.segmentation_target_mask - _target_: gr00t.data.transform.ConcatTransform video_concat_order: - video.ego_view_bg_crop_pad_res256_freq20 state_concat_order: - state.left_arm - state.right_arm - state.left_hand - state.right_hand - state.waist action_concat_order: - action.segmentation_target - action.segmentation_target_mask - _target_: gr00t.model.transforms_idm.GR00TIDMTransform default_instruction: Perform the default behavior. num_visual_tokens_per_frame: 16 max_num_images_per_sequence: 6 max_action_dim: 32 max_sequence_length: 112 action_horizon: 16 siglip_processor: _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained _convert_: object pretrained_model_name_or_path: google/siglip2-large-patch16-256 embodiment_tag_mapping: real_gr1_arms_only: 0 real_gr1_arms_only_annotated: 1 real_gr1_arms_waist: 2 real_gr1_arms_waist_annotated: 3 dexmg_gr1_arms_only_inspire: 4 dexmg_gr1_arms_only_fourier: 5 dexmg_gr1_arms_waist_fourier: 6 robocasa_single_arm: 7 onex_eve_gripper: 8 robocasa_gr1_arms_only_inspire_hands: 9 robocasa_gr1_arms_only_fourier_hands: 10 robocasa_gr1_fixed_lower_body_inspire_hands: 11 robocasa_gr1_fixed_lower_body_fourier_hands: 12 robocasa_panda_omron: 13 robocasa_bimanual_panda_parallel_gripper: 15 robocasa_bimanual_panda_inspire_hand: 16 oxe_droid: 17 oxe_fractal: 18 oxe_language_table: 19 oxe_bridge: 20 real_panda_single_arm: 21 unknown: 22 hot3d_hands_only: 23 gr1_unified: 24 robocasa_gr1_arms_waist_fourier_hands: 25 lapa: 27 oxe_mutex: 28 oxe_roboset: 29 oxe_plex: 30 dream: 31 gr1_unified_segmentation: 14 metadata_versions: robocasa_gr1_arms_only_fourier_hands: '0217' robocasa_gr1_fixed_lower_body_fourier_hands: '0217' robocasa_bimanual_panda_parallel_gripper: '0217' robocasa_bimanual_panda_inspire_hand: '0217' robocasa_panda_omron: '0217' gr1_unified: '0304' oxe_droid: '0221' 
oxe_fractal: '0221' oxe_language_table: '0221' oxe_bridge: '0221' robocasa_gr1_arms_waist_fourier_hands: '0225' hot3d_hands_only: '0220' agibot: '0306' oxe_mutex: '0303' oxe_plex: '0303' oxe_roboset: '0303' lapa: '0305' dream: '0308' gr1_unified_segmentation: '0309' max_state_dim: 44 dataset_shard_sampling_rate: 0.1 mixture_dataset_cls: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec single_dataset_cls: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotSingleDataset data_root: /mnt/amlfs-03/shared/datasets gr00t_commit_hash: 83c31e50e727eb21e80857dea54541752d89811f total_training_steps: 16384000000