Envs API

Documentation for simulation environments and task definitions.

embodied_gen.envs.pick_embodiedgen

PickEmbodiedGen

PickEmbodiedGen(*args, robot_uids: str | list[str] = 'panda', robot_init_qpos_noise: float = 0.02, num_envs: int = 1, reconfiguration_freq: int = None, **kwargs)

Bases: BaseEnv

PickEmbodiedGen is an example Gym environment for object pick-and-place tasks.

This environment simulates a robot interacting with 3D assets in an EmbodiedGen-generated scene in SAPIEN. It supports multi-environment setups, dynamic reconfiguration, and hybrid rendering with 3D Gaussian Splatting.

Example

Use gym.make to create the PickEmbodiedGen-v1 parallel environment.

import gymnasium as gym
# NOTE: `cfg` is not defined in this snippet; it stands for a user-supplied
# configuration object exposing the attributes read below.
env = gym.make(
    "PickEmbodiedGen-v1",
    num_envs=cfg.num_envs,
    render_mode=cfg.render_mode,
    enable_shadow=cfg.enable_shadow,
    layout_file=cfg.layout_file,
    control_mode=cfg.control_mode,
    # Same four keys as the default camera_cfg built in __init__ when
    # camera_cfg is omitted.
    camera_cfg=dict(
        camera_eye=cfg.camera_eye,
        camera_target_pt=cfg.camera_target_pt,
        image_hw=cfg.image_hw,
        fovy_deg=cfg.fovy_deg,
    ),
)

Initializes the PickEmbodiedGen environment.

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list for the base class.	`()`
`robot_uids`	`str \| list[str]`	The robot(s) to use in the environment.	`'panda'`
`robot_init_qpos_noise`	`float`	Noise added to the robot's initial joint positions.	`0.02`
`num_envs`	`int`	The number of parallel environments to create.	`1`
`reconfiguration_freq`	`int`	How often to reconfigure the scene. If None, it is set based on num_envs.	`None`
`**kwargs`		Additional keyword arguments for environment setup, including layout_file, replace_objs, enable_grasp, etc.	`{}`

Source code in embodied_gen/envs/pick_embodiedgen.py

def __init__(
    self,
    *args,
    robot_uids: str | list[str] = "panda",
    robot_init_qpos_noise: float = 0.02,
    num_envs: int = 1,
    reconfiguration_freq: int = None,
    **kwargs,
):
    """Prepare layouts, robot poses and render state, then build the env.

    Environment-specific options are popped from ``kwargs`` before the
    remaining keyword arguments are forwarded to the base class.

    Args:
        *args: Positional arguments forwarded to the base class.
        robot_uids: The robot(s) to use in the environment.
        robot_init_qpos_noise: Noise added to the robot's initial joint
            positions.
        num_envs: The number of parallel environments to create.
        reconfiguration_freq: How often to reconfigure the scene. If None,
            it becomes 1 when ``num_envs`` is 1 and 0 otherwise.
        **kwargs: Additional keyword arguments. The keys consumed here are
            layout_file, replace_objs, enable_grasp, init_3dgs_quat,
            objs_z_offset, robot_z_offset and camera_cfg; everything else
            is passed on to the base class.
    """
    self.robot_init_qpos_noise = robot_init_qpos_noise
    if reconfiguration_freq is None:
        reconfiguration_freq = 1 if num_envs == 1 else 0

    # Options owned by this class are removed from kwargs so they never
    # reach the base-class constructor.
    layout_file = kwargs.pop("layout_file", None)
    replace_objs = kwargs.pop("replace_objs", True)
    self.enable_grasp = kwargs.pop("enable_grasp", False)
    default_3dgs_quat = [0.7071, 0, 0, 0.7071]
    self.init_3dgs_quat = kwargs.pop("init_3dgs_quat", default_3dgs_quat)

    # Small z-axis offsets keep objects and robot from starting in collision.
    self.objs_z_offset = kwargs.pop("objs_z_offset", 0.002)
    self.robot_z_offset = kwargs.pop("robot_z_offset", 0.002)

    user_camera_cfg = kwargs.pop("camera_cfg", None)
    if user_camera_cfg is not None:
        self.camera_cfg = user_camera_cfg
    else:
        self.camera_cfg = {
            "camera_eye": [0.9, 0.0, 1.1],
            "camera_target_pt": [0.0, 0.0, 0.9],
            "image_hw": [256, 256],
            "fovy_deg": 75,
        }

    # Layouts and robot poses are computed before the base-class
    # constructor runs, so they are already available during its setup.
    self.layouts = self.init_env_layouts(layout_file, num_envs, replace_objs)
    self.robot_pose = self.compute_robot_init_pose(
        self.layouts, num_envs, self.robot_z_offset
    )
    self.env_actors = {}
    self.image_transform = transforms.PILToTensor()

    super().__init__(
        *args,
        robot_uids=robot_uids,
        reconfiguration_freq=reconfiguration_freq,
        num_envs=num_envs,
        **kwargs,
    )

    # Backgrounds are pre-rendered only for the "hybrid" render mode.
    self.bg_images = {}
    if self.render_mode == "hybrid":
        self.bg_images = self.render_gs3d_images(
            self.layouts, num_envs, self.init_3dgs_quat
        )

compute_dense_reward

compute_dense_reward(obs: any, action: Tensor, info: dict)

Computes a dense reward for the current step.

The reward is a composite of reaching, grasping, placing, and maintaining a static final pose.

Parameters:

Name	Type	Description	Default
`obs`	`any`	The current observation.	required
`action`	`Tensor`	The action taken in the current step.	required
`info`	`dict`	A dictionary containing evaluation information from `evaluate()`.	required

Returns:

Type	Description
	A tensor containing the dense reward for each environment.

Source code in embodied_gen/envs/pick_embodiedgen.py

def compute_dense_reward(self, obs: any, action: torch.Tensor, info: dict):
    """Accumulate the shaped per-environment reward for the current step.

    The reward sums a reaching term, a grasp bonus, a placing term, a
    placed bonus and a low-joint-velocity term; environments flagged as
    successful are overwritten with the fixed value 6.

    Args:
        obs: The current observation (not used by this computation).
        action: The action taken in the current step (not used here).
        info: Evaluation dictionary; the keys "is_grasped",
            "is_obj_placed" and "success" are read.

    Returns:
        A tensor containing the dense reward for each environment.
    """
    # Reaching: approaches 1 as the TCP gets closer to the object.
    gripper_gap = torch.linalg.norm(
        self.obj.pose.p - self.agent.tcp.pose.p, axis=1
    )
    total = 1 - torch.tanh(5 * gripper_gap)

    grasped = info["is_grasped"]
    total += grasped

    # NOTE: the goal-site based distance
    # (self.goal_site.pose.p - self.obj.pose.p) is disabled; the object
    # position is subtracted from itself, so this distance is zero and
    # the placing term reduces to the grasp mask.
    goal_gap = torch.linalg.norm(self.obj.pose.p - self.obj.pose.p, axis=1)
    placing_term = 1 - torch.tanh(5 * goal_gap)
    total += placing_term * grasped

    total += info["is_obj_placed"] * grasped

    # Static term: norm of the joint velocities with the last two
    # entries dropped.
    joint_speed = torch.linalg.norm(
        self.agent.robot.get_qvel()[..., :-2], axis=1
    )
    stillness_term = 1 - torch.tanh(5 * joint_speed)
    total += stillness_term * info["is_obj_placed"] * grasped

    total[info["success"]] = 6
    return total

compute_normalized_dense_reward

compute_normalized_dense_reward(obs: any, action: Tensor, info: dict)

Computes a dense reward normalized to be between 0 and 1.

Parameters:

Name	Type	Description	Default
`obs`	`any`	The current observation.	required
`action`	`Tensor`	The action taken in the current step.	required
`info`	`dict`	A dictionary containing evaluation information from `evaluate()`.	required

Returns:

Type	Description
	A tensor containing the normalized dense reward for each environment.

Source code in embodied_gen/envs/pick_embodiedgen.py

def compute_normalized_dense_reward(
    self, obs: any, action: torch.Tensor, info: dict
):
    """Return the dense reward divided by 6.

    Args:
        obs: The current observation.
        action: The action taken in the current step.
        info: A dictionary containing evaluation information from `evaluate()`.

    Returns:
        A tensor containing the normalized dense reward for each environment.
    """
    dense = self.compute_dense_reward(obs=obs, action=action, info=info)
    # 6 is the value compute_dense_reward assigns to successful environments.
    return dense / 6

compute_robot_init_pose `staticmethod`

compute_robot_init_pose(layouts: list[str], num_envs: int, z_offset: float = 0.0) -> list[list[float]]

Computes the initial pose for the robot in each environment.

Parameters:

Name	Type	Description	Default
`layouts`	`list[str]`	A list of file paths to the environment layouts.	required
`num_envs`	`int`	The number of environments.	required
`z_offset`	`float`	An optional vertical offset to apply to the robot's position to prevent collisions.	`0.0`

Returns:

Type	Description
`list[list[float]]`	A list of initial poses ([x, y, z, qw, qx, qy, qz]) for the robot in each environment.

Source code in embodied_gen/envs/pick_embodiedgen.py

@staticmethod
def compute_robot_init_pose(
    layouts: list[str], num_envs: int, z_offset: float = 0.0
) -> list[list[float]]:
    """Computes the initial pose for the robot in each environment.

    Args:
        layouts: A list of file paths to the environment layouts.
        num_envs: The number of environments.
        z_offset: An optional vertical offset to apply to the robot's
            position to prevent collisions.

    Returns:
        A list of initial poses ([x, y, z, qw, qx, qy, qz]) for the robot
        in each environment.
    """
    robot_pose = []
    for env_idx in range(num_envs):
        layout = json.load(open(layouts[env_idx], "r"))
        layout = LayoutInfo.from_dict(layout)
        robot_node = layout.relation[Scene3DItemEnum.ROBOT.value]
        x, y, z, qx, qy, qz, qw = layout.position[robot_node]
        robot_pose.append([x, y, z + z_offset, qw, qx, qy, qz])

    return robot_pose

evaluate

evaluate()

Evaluates the current state of the environment.

Checks for task success criteria such as whether the object is grasped, placed at the goal, and if the robot is static.

Returns:

Type	Description
	A dictionary containing boolean tensors for various success metrics, including 'is_grasped', 'is_obj_placed', and overall 'success'.

Source code in embodied_gen/envs/pick_embodiedgen.py

def evaluate(self):
    """Evaluates the current state of the environment.

    Computes whether the object is grasped, whether it is within
    ``self.goal_thresh`` of the goal, and whether the robot is static.
    Overall success requires the object to be placed and the robot to be
    static.

    Returns:
        A dictionary with the keys 'is_grasped', 'obj_to_goal_pos',
        'is_obj_placed', 'is_robot_static', 'is_grasping' and 'success'.
    """
    # The goal-site offset (self.goal_site.pose.p - self.obj.pose.p) is
    # disabled, so the object's position itself is used as the offset.
    obj_to_goal_pos = self.obj.pose.p
    is_obj_placed = (
        torch.linalg.norm(obj_to_goal_pos, axis=1) <= self.goal_thresh
    )
    is_grasped = self.agent.is_grasping(self.obj)
    is_robot_static = self.agent.is_static(0.2)

    return dict(
        is_grasped=is_grasped,
        obj_to_goal_pos=obj_to_goal_pos,
        is_obj_placed=is_obj_placed,
        is_robot_static=is_robot_static,
        # Same query as `is_grasped`; reuse the result rather than asking
        # the agent a second time within the same evaluation.
        is_grasping=is_grasped,
        success=torch.logical_and(is_obj_placed, is_robot_static),
    )

hybrid_render

hybrid_render()

Renders a hybrid image by blending simulated foreground with a background.

The foreground is rendered with an alpha channel and then blended with the pre-rendered Gaussian Splatting background image.

Returns:

Type	Description
	A torch tensor of the final blended RGB images.

Source code in embodied_gen/envs/pick_embodiedgen.py

def hybrid_render(self):
    """Composite the simulated foreground over the cached backgrounds.

    The foreground is rendered with an alpha channel and blended with each
    pre-rendered background stored under a key containing "render_camera".

    Returns:
        A torch tensor of the final blended images, truncated to the first
        three channels of the last dimension.
    """
    # Rendered with alpha so it can be blended over the background;
    # presumably shaped (n_env, h, w, 4) - TODO confirm.
    foreground = self.render_rgb_array(return_alpha=True)

    blended = []
    for name, background in self.bg_images.items():
        # Only backgrounds captured from the render camera are composited.
        if "render_camera" in name:
            # Keys end in "-env<idx>"; recover the environment index.
            idx = int(name.split("-env")[-1])
            composite = alpha_blend_rgba(
                foreground[idx].cpu().numpy(), background
            )
            blended.append(self.image_transform(composite))

    # Stack per-environment results, then move dim 1 to the end
    # (presumably channels-first -> channels-last).
    batch = torch.stack(blended, dim=0).permute(0, 2, 3, 1)

    return batch[..., :3]

init_env_layouts `staticmethod`

init_env_layouts(layout_file: str, num_envs: int, replace_objs: bool) -> list[str]

Initializes and saves layout files for each environment instance.

For each environment, this method creates a layout configuration. If replace_objs is True, it generates new object placements for each subsequent environment. The generated layouts are saved as new JSON files.

Parameters:

Name	Type	Description	Default
`layout_file`	`str`	Path to the base layout JSON file.	required
`num_envs`	`int`	The number of environments to create layouts for.	required
`replace_objs`	`bool`	If True, generates new object placements for each environment after the first one using BFS placement.	required

Returns:

Type	Description
`list[str]`	A list of file paths to the generated layout for each environment.

Source code in embodied_gen/envs/pick_embodiedgen.py

@staticmethod
def init_env_layouts(
    layout_file: str, num_envs: int, replace_objs: bool
) -> list[LayoutInfo]:
    """Initializes and saves layout files for each environment instance.

    For each environment, this method creates a layout configuration. If
    `replace_objs` is True, it generates new object placements for each
    subsequent environment. The generated layouts are saved as new JSON
    files.

    Args:
        layout_file: Path to the base layout JSON file.
        num_envs: The number of environments to create layouts for.
        replace_objs: If True, generates new object placements for each
            environment after the first one using BFS placement.

    Returns:
        A list of file paths to the generated layout for each environment.
    """
    layouts = []
    for env_idx in range(num_envs):
        if replace_objs and env_idx > 0:
            layout_info = bfs_placement(layout_file)
        else:
            layout_info = json.load(open(layout_file, "r"))
            layout_info = LayoutInfo.from_dict(layout_info)

        layout_path = layout_file.replace(".json", f"_env{env_idx}.json")
        with open(layout_path, "w") as f:
            json.dump(layout_info.to_dict(), f, indent=4)

        layouts.append(layout_path)

    return layouts

render

render()

Renders the environment based on the configured render_mode.

Raises:

Type	Description
`RuntimeError`	If `render_mode` is not set.
`NotImplementedError`	If the `render_mode` is not supported.

Returns:

Type	Description
	The rendered output, which varies depending on the render mode.

Source code in embodied_gen/envs/pick_embodiedgen.py

def render(self):
    """Dispatch to the renderer that matches ``self.render_mode``.

    Raises:
        RuntimeError: If `render_mode` is not set.
        NotImplementedError: If the `render_mode` is not supported.

    Returns:
        The rendered output, which varies depending on the render mode.
    """
    mode = self.render_mode
    if mode is None:
        raise RuntimeError("render_mode is not set.")

    # Guard clauses: the first matching mode returns immediately.
    if mode == "human":
        return self.render_human()
    if mode == "rgb_array":
        return self.render_rgb_array()
    if mode == "sensors":
        return self.render_sensors()
    if mode == "all":
        return self.render_all()
    if mode == "hybrid":
        return self.hybrid_render()

    raise NotImplementedError(
        f"Unsupported render mode {self.render_mode}."
    )

render_gs3d_images

render_gs3d_images(layouts: list[str], num_envs: int, init_quat: list[float]) -> dict[str, np.ndarray]

Renders background images using a pre-trained Gaussian Splatting model.

This method pre-renders the static background for each environment from the perspective of all cameras to be used for hybrid rendering.

Parameters:

Name	Type	Description	Default
`layouts`	`list[str]`	A list of file paths to the environment layouts.	required
`num_envs`	`int`	The number of environments.	required
`init_quat`	`list[float]`	An initial quaternion to orient the Gaussian Splatting model.	required

Returns:

Type	Description
`dict[str, ndarray]`	A dictionary mapping a unique key of the form '<camera name>-env<env_idx>' to the rendered background image as a numpy array.

Source code in embodied_gen/envs/pick_embodiedgen.py

def render_gs3d_images(
    self, layouts: list[str], num_envs: int, init_quat: list[float]
) -> dict[str, np.ndarray]:
    """Renders background images using a pre-trained Gaussian Splatting model.

    This method pre-renders the static background for each environment from
    the perspective of every sensor camera and human render camera, to be
    used for hybrid rendering. The Gaussian Splatting model is loaded once
    from the background asset referenced by the first layout and re-posed
    per environment.

    Args:
        layouts: A list of file paths to the environment layouts.
        num_envs: The number of environments.
        init_quat: An initial quaternion to orient the Gaussian Splatting
            model.

    Returns:
        A dictionary mapping a key of the form
        '<camera name>-env<env_idx>' to the rendered background image.
    """
    sim_coord_align = (
        torch.tensor(SIM_COORD_ALIGN).to(torch.float32).to(self.device)
    )
    # Copy so that adding the human render cameras does not mutate the
    # scene's own sensor mapping.
    cameras = self.scene.sensors.copy()
    cameras.update(self.scene.human_render_cameras)

    # Preload the background Gaussian Splatting model.
    asset_root = os.path.dirname(layouts[0])
    # Context managers close the layout files right after parsing instead
    # of leaking the handles.
    with open(layouts[0], "r") as f:
        layout = LayoutInfo.from_dict(json.load(f))
    bg_node = layout.relation[Scene3DItemEnum.BACKGROUND.value]
    gs_path = os.path.join(
        asset_root, layout.assets[bg_node], "gs_model.ply"
    )
    raw_gs: GaussianOperator = GaussianOperator.load_from_ply(gs_path)
    bg_images = dict()
    for env_idx in tqdm(range(num_envs), desc="Pre-rendering Background"):
        with open(layouts[env_idx], "r") as f:
            layout = LayoutInfo.from_dict(json.load(f))
        # Compose the stored background orientation with init_quat before
        # posing the Gaussians for this environment.
        x, y, z, qx, qy, qz, qw = layout.position[bg_node]
        qx, qy, qz, qw = quaternion_multiply([qx, qy, qz, qw], init_quat)
        init_pose = torch.tensor([x, y, z, qx, qy, qz, qw])
        gs_model = raw_gs.get_gaussians(instance_pose=init_pose)
        for key in cameras:
            camera = cameras[key]
            Ks = camera.camera.get_intrinsic_matrix()  # (n_env, 3, 3)
            c2w = camera.camera.get_model_matrix()  # (n_env, 4, 4)
            result = gs_model.render(
                c2w[env_idx] @ sim_coord_align,
                Ks[env_idx],
                image_width=camera.config.width,
                image_height=camera.config.height,
            )
            # Reverse the last axis (presumably a channel-order swap such
            # as BGR <-> RGB - TODO confirm).
            bg_images[f"{key}-env{env_idx}"] = result.rgb[..., ::-1]

    return bg_images

render_rgb_array

render_rgb_array(camera_name: str = None, return_alpha: bool = False)

Renders an RGB image from the human-facing render camera.

Parameters:

Name	Type	Description	Default
`camera_name`	`str`	The name of the camera to render from. If None, uses all human render cameras.	`None`
`return_alpha`	`bool`	Whether to include the alpha channel in the output.	`False`

Returns:

Type	Description
	A numpy array representing the rendered image(s). If multiple cameras are used, the images are tiled.

Source code in embodied_gen/envs/pick_embodiedgen.py

def render_rgb_array(
    self, camera_name: str = None, return_alpha: bool = False
):
    """Renders an RGB image from the human-facing render camera.

    Hidden objects are made visible for the duration of the render and are
    hidden again before returning, on every return path.

    Args:
        camera_name: The name of the camera to render from. If None, uses
            all human render cameras.
        return_alpha: Whether to include the alpha channel in the output.

    Returns:
        None if no camera image is available; the single image unchanged if
        exactly one camera produced output; otherwise the images tiled
        together with `tile_images`.
    """
    for obj in self._hidden_objects:
        obj.show_visual()
    try:
        self.scene.update_render(
            update_sensors=False, update_human_render_cameras=True
        )
        render_images = self.scene.get_human_render_camera_images(
            camera_name, return_alpha
        )
        images = list(render_images.values())
        if not images:
            return None
        if len(images) == 1:
            return images[0]
        return tile_images(images)
    finally:
        # Restore visibility on every exit path. Previously the early
        # returns for zero or one image left the hidden objects visible.
        for obj in self._hidden_objects:
            obj.hide_visual()

render_sensors

render_sensors()

Renders images from all on-board sensor cameras.

Returns:

Type	Description
	A tiled image of all sensor outputs as a numpy array.

Source code in embodied_gen/envs/pick_embodiedgen.py

def render_sensors(self):
    """Tile every image produced by the sensors into one picture.

    Returns:
        The result of `tile_images` applied to all sensor images.
    """
    # get_sensor_images() yields a two-level mapping; flatten the inner
    # images in iteration order.
    flattened = [
        img
        for per_sensor in self.get_sensor_images().values()
        for img in per_sensor.values()
    ]
    return tile_images(flattened)

Envs API

embodied_gen.envs.pick_embodiedgen

PickEmbodiedGen

compute_dense_reward

compute_normalized_dense_reward

compute_robot_init_pose staticmethod

evaluate

hybrid_render

init_env_layouts staticmethod

render

render_gs3d_images

render_rgb_array

render_sensors

compute_robot_init_pose `staticmethod`

init_env_layouts `staticmethod`