Trainer API

This section covers the training pipelines for various models.

embodied_gen.trainer.gsplat_trainer

Runner

Runner(local_rank: int, world_rank, world_size: int, cfg: GsplatTrainConfig)

Engine for training and testing, adapted from the gsplat example trainer.

Code from https://github.com/nerfstudio-project/gsplat/blob/main/examples/simple_trainer.py

Source code in embodied_gen/trainer/gsplat_trainer.py

def __init__(
    self,
    local_rank: int,
    world_rank,
    world_size: int,
    cfg: GsplatTrainConfig,
) -> None:
    """Prepares output folders, datasets, splats, strategy state and metrics.

    Args:
        local_rank (int): Rank used to offset the random seed and to select
            the ``cuda:{local_rank}`` device.
        world_rank: Rank stored on the runner and forwarded to the splat
            factory.
        world_size (int): Number of ranks, forwarded to the splat factory.
        cfg (GsplatTrainConfig): Training configuration.

    Raises:
        ValueError: If ``cfg.lpips_net`` is neither ``"alex"`` nor ``"vgg"``.
    """
    set_random_seed(42 + local_rank)

    self.cfg = cfg
    self.world_rank = world_rank
    self.local_rank = local_rank
    self.world_size = world_size
    self.device = f"cuda:{local_rank}"

    # Root folder for everything this run writes.
    os.makedirs(cfg.result_dir, exist_ok=True)

    # One sub-folder per artifact type, each remembered on the instance.
    for attr_name, subdir in (
        ("ckpt_dir", "ckpts"),
        ("stats_dir", "stats"),
        ("render_dir", "renders"),
        ("ply_dir", "ply"),
    ):
        subdir_path = f"{cfg.result_dir}/{subdir}"
        setattr(self, attr_name, subdir_path)
        os.makedirs(subdir_path, exist_ok=True)

    # Tensorboard writer and datasets. The validation set is a capped
    # subset (6 samples) of the training split.
    self.writer = SummaryWriter(log_dir=f"{cfg.result_dir}/tb")
    self.trainset = PanoGSplatDataset(cfg.data_dir, split="train")
    self.valset = PanoGSplatDataset(
        cfg.data_dir, split="train", max_sample_num=6
    )
    self.testset = PanoGSplatDataset(cfg.data_dir, split="eval")
    self.scene_scale = cfg.scene_scale

    # Gaussians and their optimizers, seeded from the dataset point cloud.
    splats, optimizers = create_splats_with_optimizers(
        self.trainset.points,
        self.trainset.points_rgb,
        init_num_pts=cfg.init_num_pts,
        init_extent=cfg.init_extent,
        init_opacity=cfg.init_opa,
        init_scale=cfg.init_scale,
        means_lr=cfg.means_lr,
        scales_lr=cfg.scales_lr,
        opacities_lr=cfg.opacities_lr,
        quats_lr=cfg.quats_lr,
        sh0_lr=cfg.sh0_lr,
        shN_lr=cfg.shN_lr,
        scene_scale=self.scene_scale,
        sh_degree=cfg.sh_degree,
        sparse_grad=cfg.sparse_grad,
        visible_adam=cfg.visible_adam,
        batch_size=cfg.batch_size,
        feature_dim=None,
        device=self.device,
        world_rank=world_rank,
        world_size=world_size,
    )
    self.splats = splats
    self.optimizers = optimizers
    print("Model initialized. Number of GS:", len(self.splats["means"]))

    # Densification strategy: validate, then build its per-run state.
    strategy = self.cfg.strategy
    strategy.check_sanity(self.splats, self.optimizers)

    if isinstance(strategy, DefaultStrategy):
        self.strategy_state = strategy.initialize_state(
            scene_scale=self.scene_scale
        )
    elif isinstance(strategy, MCMCStrategy):
        self.strategy_state = strategy.initialize_state()
    else:
        assert_never(strategy)

    # Image-quality metrics, all expecting values in [0, 1].
    self.ssim = StructuralSimilarityIndexMeasure(data_range=1.0).to(
        self.device
    )
    self.psnr = PeakSignalNoiseRatio(data_range=1.0).to(self.device)

    if cfg.lpips_net == "alex":
        lpips_net, lpips_normalize = "alex", True
    elif cfg.lpips_net == "vgg":
        # The 3DGS official repo uses lpips vgg, which corresponds to
        # disabling input normalization here.
        lpips_net, lpips_normalize = "vgg", False
    else:
        raise ValueError(f"Unknown LPIPS network: {cfg.lpips_net}")

    self.lpips = LearnedPerceptualImagePatchSimilarity(
        net_type=lpips_net, normalize=lpips_normalize
    ).to(self.device)

eval

eval(step: int, stage: str = 'val', canvas_h: int = 512, canvas_w: int = 1024)

Entry for evaluation.

Source code in embodied_gen/trainer/gsplat_trainer.py

@torch.no_grad()
def eval(
    self,
    step: int,
    stage: str = "val",
    canvas_h: int = 512,
    canvas_w: int = 1024,
):
    """Entry for evaluation.

    Renders every sample of the validation set, and on rank 0 writes a
    comparison image per sample plus aggregated PSNR / SSIM / LPIPS,
    timing and Gaussian-count statistics to JSON and TensorBoard.

    Args:
        step (int): Training step used in file names and as the
            TensorBoard global step.
        stage (str): Prefix for output files and scalar tags.
        canvas_h (int): Height each image is resized to before comparison.
        canvas_w (int): Twice the width each image is resized to.
    """
    print("Running evaluation...")
    cfg, device, world_rank = self.cfg, self.device, self.world_rank

    loader = torch.utils.data.DataLoader(
        self.valset, batch_size=1, shuffle=False, num_workers=1
    )
    resize_to = (canvas_h, canvas_w // 2)
    ellipse_time = 0
    metrics = defaultdict(list)

    for sample_idx, batch in enumerate(loader):
        camtoworlds = batch["camtoworld"].to(device)
        Ks = batch["K"].to(device)
        target = batch["image"].to(device) / 255.0
        # Rendering uses the native resolution, read while still NHWC.
        height, width = target.shape[1:3]
        masks = None
        if "mask" in batch:
            masks = batch["mask"].to(device)

        # NHWC -> NCHW, then resize to the comparison resolution.
        target = F.interpolate(target.permute(0, 3, 1, 2), size=resize_to)

        # Synchronize around the render so the wall-clock time covers it.
        torch.cuda.synchronize()
        started = time.time()
        rendered, _, _ = self.rasterize_splats(
            camtoworlds=camtoworlds,
            Ks=Ks,
            width=width,
            height=height,
            sh_degree=cfg.sh_degree,
            near_plane=cfg.near_plane,
            far_plane=cfg.far_plane,
            masks=masks,
        )  # [1, H, W, 3]
        torch.cuda.synchronize()
        ellipse_time += max(time.time() - started, 1e-10)

        # NHWC -> NCHW, resize and clamp to the valid color range.
        rendered = F.interpolate(
            rendered.permute(0, 3, 1, 2), size=resize_to
        )
        rendered = torch.clamp(rendered, 0.0, 1.0)

        # Only rank 0 writes images and accumulates metrics.
        if world_rank != 0:
            continue

        # Stack ground truth and render along the height axis.
        canvas = torch.cat([target, rendered], dim=2).squeeze(0)
        canvas = canvas.permute(1, 2, 0)  # CHW -> HWC
        canvas = (canvas * 255).to(torch.uint8).cpu().numpy()
        cv2.imwrite(
            f"{self.render_dir}/{stage}_step{step}_{sample_idx:04d}.png",
            canvas[..., ::-1],  # reverse the channel order for OpenCV
        )
        metrics["psnr"].append(self.psnr(rendered, target))
        metrics["ssim"].append(self.ssim(rendered, target))
        metrics["lpips"].append(self.lpips(rendered, target))

    if world_rank != 0:
        return

    ellipse_time /= len(loader)

    stats = {}
    for metric_name, values in metrics.items():
        stats[metric_name] = torch.stack(values).mean().item()
    stats["ellipse_time"] = ellipse_time
    stats["num_GS"] = len(self.splats["means"])

    print(
        f"PSNR: {stats['psnr']:.3f}, SSIM: {stats['ssim']:.4f}, LPIPS: {stats['lpips']:.3f} "
        f"Time: {stats['ellipse_time']:.3f}s/image "
        f"Number of GS: {stats['num_GS']}"
    )

    # Persist the statistics as JSON.
    stats_path = f"{self.stats_dir}/{stage}_step{step:04d}.json"
    with open(stats_path, "w") as f:
        json.dump(stats, f)

    # Mirror the statistics to TensorBoard.
    for metric_name, value in stats.items():
        self.writer.add_scalar(f"{stage}/{metric_name}", value, step)
    self.writer.flush()

embodied_gen.trainer.pono2mesh_trainer

Pano2MeshSRPipeline

Pano2MeshSRPipeline(config: Pano2MeshSRConfig)

Pipeline for converting panoramic RGB images into 3D mesh representations.

This class integrates depth estimation, inpainting, mesh conversion, multi-view mesh repair, and 3D Gaussian Splatting (3DGS) dataset generation.

Parameters:

Name	Type	Description	Default
`config`	`Pano2MeshSRConfig`	Configuration object containing model and pipeline parameters.	required

Example

from embodied_gen.trainer.pono2mesh_trainer import Pano2MeshSRPipeline
from embodied_gen.utils.config import Pano2MeshSRConfig

# Build the pipeline from a configuration object created with no arguments.
config = Pano2MeshSRConfig()
pipeline = Pano2MeshSRPipeline(config)
# Calling the pipeline processes the panorama and writes results to output_dir.
pipeline(pano_image='example.png', output_dir='./output')

Initializes the pipeline with models and camera poses.

Parameters:

Name	Type	Description	Default
`config`	`Pano2MeshSRConfig`	Configuration object.	required

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def __init__(self, config: Pano2MeshSRConfig) -> None:
    """Builds the models, camera poses and dilation kernel for the pipeline.

    Args:
        config (Pano2MeshSRConfig): Pipeline configuration, kept as
            ``self.cfg``; its ``device`` becomes ``self.device``.
    """
    self.cfg = config
    self.device = config.device

    # Models used for inpainting, geometry prediction, panorama distance
    # prediction and image super-resolution.
    self.inpainter = PanoPersFusionInpainter(save_path=None)
    self.geo_predictor = PanoJointPredictor(save_path=None)
    self.pano_fusion_distance_predictor = PanoFusionDistancePredictor()
    self.super_model = ImageRealESRGAN(outscale=self.cfg.upscale_factor)

    # World-to-camera transforms of the cubemap faces, moved to the device.
    self.cubemap_w2cs = [
        w2c.to(self.device) for w2c in get_cubemap_views_world_to_cam()
    ]
    # Relative camera trajectory read from disk.
    self.camera_poses = self.load_camera_poses(self.cfg.trajectory_dir)

    # Elliptical structuring element, stored as a float tensor on the device.
    structuring_element = cv2.getStructuringElement(
        cv2.MORPH_ELLIPSE, self.cfg.kernel_size
    )
    self.kernel = (
        torch.from_numpy(structuring_element).float().to(self.device)
    )

__call__

__call__(pano_image: Image | str, output_dir: str)

Runs the pipeline to generate mesh and 3DGS data from a panoramic image.

Parameters:

Name	Type	Description	Default
`pano_image`	`Image \| str`	Input panoramic image or path.	required
`output_dir`	`str`	Directory to save outputs.	required

Returns:

Type	Description
	None

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def __call__(
    self, pano_image: Image.Image | str, output_dir: str
) -> None:
    """Runs the pipeline to generate mesh and 3DGS data from a panoramic image.

    The mesh is initialised from the panorama and its predicted depth,
    repaired via ``mesh_repair_by_greedy_view_selection``, optionally
    exported (``cfg.mesh_file``) and, when ``cfg.gs_data_file`` is set, a
    dictionary with points and train/eval cameras is written with
    ``torch.save``.

    Args:
        pano_image (Image.Image | str): Input panoramic image or path.
        output_dir (str): Directory to save outputs. It is created if it
            does not exist yet.

    Returns:
        None
    """
    # Mesh, 3DGS data and optional visualizations are all written below
    # output_dir, so make sure it exists before any expensive work starts.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    self.init_mesh_params()
    pano_rgb, pano_depth = self.preprocess_pano(pano_image)

    # Register the input panorama as the first supervision view.
    self.sup_pool = SupInfoPool()
    self.sup_pool.register_sup_info(
        pose=torch.eye(4).to(self.device),
        mask=torch.ones([self.cfg.pano_h, self.cfg.pano_w]),
        rgb=pano_rgb.permute(1, 2, 0),
        distance=pano_depth[..., None],
    )
    self.sup_pool.gen_occ_grid(res=256)

    logger.info("Init mesh from pano RGBD image...")
    depth_edge = self.get_edge_image_by_depth(pano_depth)
    # Mask is True everywhere except on detected depth edges.
    inpaint_edge_mask = (
        ~torch.from_numpy(depth_edge).to(self.device).bool()
    )
    self.rgbd_to_mesh(pano_rgb, pano_depth, inpaint_edge_mask)

    repair_poses = self.load_inpaint_poses(self.camera_poses)
    inpainted_panos_w_poses = self.mesh_repair_by_greedy_view_selection(
        repair_poses, output_dir
    )
    torch.cuda.empty_cache()
    # init_mesh_params switched the default device to self.device; switch
    # back to CPU for the export steps below.
    torch.set_default_device("cpu")

    if self.cfg.mesh_file is not None:
        mesh_path = os.path.join(output_dir, self.cfg.mesh_file)
        self.save_mesh(mesh_path)

    if self.cfg.gs_data_file is None:
        return

    logger.info("Dump data for 3DGS training...")
    points_rgb = (self.colors.clip(0, 1) * 255).to(torch.uint8)
    data = {
        "points": self.vertices.permute(1, 0).cpu().numpy(),  # (N, 3)
        "points_rgb": points_rgb.permute(1, 0).cpu().numpy(),  # (N, 3)
        "train": [],
        "eval": [],
    }
    # Training images are the super-resolved cubemap faces.
    image_h = self.cfg.cubemap_h * self.cfg.upscale_factor
    image_w = self.cfg.cubemap_w * self.cfg.upscale_factor
    Ks = compute_pinhole_intrinsics(image_w, image_h, self.cfg.fov)
    for idx, (pano_img, pano_pose) in enumerate(inpainted_panos_w_poses):
        cubemaps = self.pano_to_cubemap(pano_img)
        for i, cubemap_tensor in enumerate(cubemaps):
            cubemap = self.super_model(tensor_to_pil(cubemap_tensor))
            mesh_pose = self.cubemap_w2cs[i] @ pano_pose
            c2w = self.mesh_pose_to_gs_pose(mesh_pose)
            data["train"].append(
                {
                    "camtoworld": c2w.astype(np.float32),
                    "K": Ks.astype(np.float32),
                    "image": np.array(cubemap),
                    "image_h": image_h,
                    "image_w": image_w,
                    # Unique id across all panoramas and faces.
                    "image_id": len(cubemaps) * idx + i,
                }
            )

    # Camera poses for evaluation (no images attached).
    for idx, camera_pose in enumerate(self.camera_poses):
        c2w = self.mesh_pose_to_gs_pose(camera_pose)
        data["eval"].append(
            {
                "camtoworld": c2w.astype(np.float32),
                "K": Ks.astype(np.float32),
                "image_h": image_h,
                "image_w": image_w,
                "image_id": idx,
            }
        )

    data_path = os.path.join(output_dir, self.cfg.gs_data_file)
    torch.save(data, data_path)

get_edge_image_by_depth

get_edge_image_by_depth(depth: Tensor, dilate_iter: int = 1) -> np.ndarray

Computes edge image from depth map.

Parameters:

Name	Type	Description	Default
`depth`	`Tensor`	Depth map tensor.	required
`dilate_iter`	`int`	Number of dilation iterations.	`1`

Returns:

Type	Description
`ndarray`	np.ndarray: Edge image.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def get_edge_image_by_depth(
    self, depth: torch.Tensor, dilate_iter: int = 1
) -> np.ndarray:
    """Computes edge image from depth map.

    The depth map is scaled to 8-bit by its maximum, passed through Canny
    edge detection and optionally dilated with a 3x3 kernel.

    Args:
        depth (torch.Tensor): Depth map tensor. A NumPy array is accepted
            as well and used as-is.
        dilate_iter (int, optional): Number of dilation iterations. Values
            less than or equal to 0 disable dilation.

    Returns:
        np.ndarray: Edge image (``uint8``). All zeros when the depth map has
        no positive maximum.
    """
    if isinstance(depth, torch.Tensor):
        depth = depth.cpu().detach().numpy()

    max_depth = depth.max()
    if not max_depth > 0:
        # Normalising by a zero / negative / NaN maximum would produce NaNs
        # and an undefined uint8 cast; such a map carries no usable edges.
        return np.zeros(depth.shape, dtype=np.uint8)

    gray = (depth / max_depth * 255).astype(np.uint8)
    edges = cv2.Canny(gray, 60, 150)
    if dilate_iter > 0:
        kernel = np.ones((3, 3), np.uint8)
        edges = cv2.dilate(edges, kernel, iterations=dilate_iter)

    return edges

init_mesh_params

init_mesh_params() -> None

Initializes mesh parameters and inpaint mask.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def init_mesh_params(self) -> None:
    """Resets the mesh buffers and the inpaint mask.

    Also makes ``self.device`` the default torch device, so the tensors
    created here (and later tensors created without an explicit device)
    are placed on it.
    """
    torch.set_default_device(self.device)

    mask_h = self.cfg.cubemap_h
    mask_w = self.cfg.cubemap_w
    # Mask starts out all True.
    self.inpaint_mask = torch.ones(mask_h, mask_w, dtype=torch.bool)

    # Empty mesh: attributes are stored column-wise as (3, N) with N == 0.
    empty_shape = (3, 0)
    self.vertices = torch.empty(empty_shape, requires_grad=False)
    self.colors = torch.empty(empty_shape, requires_grad=False)
    self.faces = torch.empty(
        empty_shape, dtype=torch.long, requires_grad=False
    )

inpaint_panorama

inpaint_panorama(idx: int, colors: Tensor, distances: Tensor, pano_mask: Tensor) -> tuple[torch.Tensor]

Inpaints missing regions in a panorama.

Parameters:

Name	Type	Description	Default
`idx`	`int`	Index of the panorama.	required
`colors`	`Tensor`	RGB image tensor.	required
`distances`	`Tensor`	Distance map tensor.	required
`pano_mask`	`Tensor`	Mask tensor.	required

Returns:

Type	Description
`tuple[Tensor, Tensor, Tensor]`	tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Inpainted RGB image, distances, and normals.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def inpaint_panorama(
    self,
    idx: int,
    colors: torch.Tensor,
    distances: torch.Tensor,
    pano_mask: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Inpaints missing regions in a panorama.

    The mask is thresholded at 0.5, dilated with ``self.kernel`` and used
    both to blend the inpainted colors into the input and to guide the
    geometry predictor.

    Args:
        idx (int): Index of the panorama.
        colors (torch.Tensor): RGB image tensor.
        distances (torch.Tensor): Distance map tensor.
        pano_mask (torch.Tensor): Mask tensor; values above 0.5 mark the
            pixels to inpaint.

    Returns:
        tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Inpainted RGB
        image, distances (squeezed), and normals.
    """
    # Binarize and add batch/channel axes so the mask can be dilated.
    mask = (pano_mask[None, ..., None] > 0.5).float()
    mask = mask.permute(0, 3, 1, 2)
    mask = dilation(mask, kernel=self.kernel)
    mask = mask[0, 0, ..., None]  # hwc

    inpainted_img = self.inpainter.inpaint(idx, colors, mask)
    # Keep the original colors outside the mask.
    inpainted_img = colors * (1 - mask) + inpainted_img * mask

    inpainted_distances, inpainted_normals = self.geo_predictor(
        idx,
        inpainted_img,
        distances[..., None],
        mask=mask,
        reg_loss_weight=0.0,
        normal_loss_weight=5e-2,
        normal_tv_loss_weight=5e-2,
    )

    return inpainted_img, inpainted_distances.squeeze(), inpainted_normals

load_camera_poses

load_camera_poses(trajectory_dir: str) -> tuple[np.ndarray, list[torch.Tensor]]

Loads camera poses from a directory.

Parameters:

Name	Type	Description	Default
`trajectory_dir`	`str`	Directory containing camera pose files.	required

Returns:

Type	Description
`list[Tensor]`	list[torch.Tensor]: List of relative camera poses.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def load_camera_poses(self, trajectory_dir: str) -> list[torch.Tensor]:
    """Loads camera poses from a directory.

    Every file whose name starts with ``camera_pose`` is read in sorted
    (lexicographic) file-name order and converted to a pose relative to a
    reference pose.

    Args:
        trajectory_dir (str): Directory containing camera pose files.

    Returns:
        list[torch.Tensor]: Relative camera poses as float32 4x4 tensors,
        one per pose file. Empty when no pose file is found.
    """
    # NOTE(review): the sort is lexicographic, so unpadded numeric suffixes
    # (e.g. "_10" vs "_2") would not be in numeric order - confirm the
    # naming scheme of the pose files.
    pose_filenames = sorted(
        fname
        for fname in os.listdir(trajectory_dir)
        if fname.startswith("camera_pose")
    )

    pano_pose_world = None
    relative_poses = []
    for idx, filename in enumerate(pose_filenames):
        pose_path = os.path.join(trajectory_dir, filename)
        pose_matrix = self.read_camera_pose_file(pose_path)

        if pano_pose_world is None:
            # The first pose, shifted by the configured offset along x and
            # z, serves as the panorama reference pose.
            pano_pose_world = pose_matrix.copy()
            pano_pose_world[0, 3] += self.cfg.pano_center_offset[0]
            pano_pose_world[2, 3] += self.cfg.pano_center_offset[1]

        # Use different reference for the first 6 cubemap views: they are
        # taken relative to themselves, later ones relative to the panorama.
        reference_pose = pose_matrix if idx < 6 else pano_pose_world
        relative_matrix = pose_matrix @ np.linalg.inv(reference_pose)
        relative_matrix[0:2, :] *= -1  # flip_xy
        relative_matrix = (
            relative_matrix @ rot_z_world_to_cam(180).cpu().numpy()
        )
        relative_matrix[:3, 3] *= self.cfg.pose_scale
        relative_matrix = torch.tensor(
            relative_matrix, dtype=torch.float32
        )
        relative_poses.append(relative_matrix)

    return relative_poses

load_inpaint_poses

load_inpaint_poses(poses: Tensor) -> dict[int, torch.Tensor]

Samples and loads poses for inpainting.

Parameters:

Name	Type	Description	Default
`poses`	`Tensor`	Tensor of camera poses.	required

Returns:

Type	Description
`dict[int, Tensor]`	dict[int, torch.Tensor]: Dictionary mapping indices to pose tensors.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def load_inpaint_poses(
    self, poses: torch.Tensor
) -> dict[int, torch.Tensor]:
    """Builds translation-only poses for a strided subset of the trajectory.

    Args:
        poses (torch.Tensor): Camera poses; every
            ``cfg.inpaint_frame_stride``-th entry is used.

    Returns:
        dict[int, torch.Tensor]: Maps the running index of each sampled
        pose to a 4x4 tensor on ``self.device`` with identity rotation and
        the negated translation of the inverted input pose.
    """
    stride = self.cfg.inpaint_frame_stride
    result = {}
    for view_idx, w2c_tensor in enumerate(poses[::stride]):
        cam_to_world = np.linalg.inv(
            w2c_tensor.cpu().numpy().astype(np.float32)
        )
        # Start from identity so only the translation column is filled in.
        translation_only = torch.eye(4)
        translation_only[:3, 3] = torch.from_numpy(cam_to_world[:3, 3])
        translation_only[:3, 3] *= -1
        result[view_idx] = translation_only.to(self.device)

    return result

mesh_pose_to_gs_pose

mesh_pose_to_gs_pose(mesh_pose: Tensor) -> np.ndarray

Converts mesh pose to 3D Gaussian Splatting pose.

Parameters:

Name	Type	Description	Default
`mesh_pose`	`Tensor`	Mesh pose tensor.	required

Returns:

Type	Description
`ndarray`	np.ndarray: Converted pose matrix.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def mesh_pose_to_gs_pose(self, mesh_pose: torch.Tensor) -> np.ndarray:
    """Turns a mesh world-to-camera pose into a 3DGS camera-to-world matrix.

    Args:
        mesh_pose (torch.Tensor): Mesh pose tensor (4x4); left unmodified.

    Returns:
        np.ndarray: 4x4 camera-to-world matrix.
    """
    # Work on a copy and negate the first two rows.
    flipped = mesh_pose.clone()
    flipped[0, :] *= -1
    flipped[1, :] *= -1

    rot_w2c = flipped[:3, :3].cpu().numpy()
    trans_w2c = flipped[:3, 3:].cpu().numpy()
    flip_yz = np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]])

    # Invert the (axis-flipped) rigid transform: transpose the rotation
    # and map the translation through it.
    rot_c2w = (flip_yz @ rot_w2c).T
    trans_c2w = -(rot_c2w @ flip_yz @ trans_w2c)

    top_rows = np.concatenate((rot_c2w, trans_c2w), axis=1)
    bottom_row = np.array([[0, 0, 0, 1]])
    return np.concatenate((top_rows, bottom_row), axis=0)

mesh_repair_by_greedy_view_selection

mesh_repair_by_greedy_view_selection(pose_dict: dict[str, Tensor], output_dir: str) -> list

Repairs mesh by selecting views greedily and inpainting missing regions.

Parameters:

Name	Type	Description	Default
`pose_dict`	`dict[str, Tensor]`	Dictionary of poses for inpainting.	required
`output_dir`	`str`	Directory to save visualizations.	required

Returns:

Name	Type	Description
`list`	`list`	List of inpainted panoramas with poses.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def mesh_repair_by_greedy_view_selection(
    self, pose_dict: dict[int, torch.Tensor], output_dir: str
) -> list:
    """Repairs mesh by selecting views greedily and inpainting missing regions.

    Each round renders the current mesh from every remaining pose, picks one
    pose based on its completeness ranking, inpaints the panorama rendered
    from it when needed, and fuses the result back into the mesh and the
    supervision pool. The loop ends when every pose has been consumed.

    Note:
        ``pose_dict`` is consumed in place: the selected entry is popped
        each round, so the dictionary is empty on return.

    Args:
        pose_dict (dict[int, torch.Tensor]): Dictionary of poses for
            inpainting, keyed by view index.
        output_dir (str): Directory to save visualizations (only written
            when ``cfg.visualize`` is set).

    Returns:
        list: List of ``[colors, pose]`` pairs, one per processed view, where
        ``colors`` is the (possibly inpainted) panorama with a leading batch
        axis.
    """
    inpainted_panos_w_pose = []
    while len(pose_dict) > 0:
        logger.info(f"Repairing mesh left rounds {len(pose_dict)}")
        sampled_views = []
        # Score every remaining pose by the fraction of pixels whose mask
        # value is 0 in the panorama rendered from the current mesh.
        for key, pose in pose_dict.items():
            pano_rgb, pano_distance, pano_mask = self.render_pano(pose)
            completeness = torch.sum(1 - pano_mask) / (pano_mask.numel())
            sampled_views.append((key, completeness.item(), pose))

        if len(sampled_views) == 0:
            break

        # Sort ascending by completeness and take the view at the two-thirds
        # position of the ranking (neither the least nor the most complete).
        sampled_views = sorted(sampled_views, key=lambda x: x[1])
        key, _, pose = sampled_views[len(sampled_views) * 2 // 3]
        pose_dict.pop(key)

        # Re-render the selected view; the renders from the scoring loop
        # are not kept.
        pano_rgb, pano_distance, pano_mask = self.render_pano(pose)

        colors = pano_rgb.permute(1, 2, 0).clone()
        distances = pano_distance.unsqueeze(-1).clone()
        pano_inpaint_mask = pano_mask.clone()
        init_pose = pose.clone()
        normals = None
        # Inpaint only when at least one mask value falls below 0.5.
        if pano_inpaint_mask.min().item() < 0.5:
            colors, distances, normals = self.inpaint_panorama(
                idx=key,
                colors=colors,
                distances=distances,
                pano_mask=pano_inpaint_mask,
            )

            # Remap the translation used for ray generation:
            # (x, y, z) -> (-x, z, 0).
            init_pose[0, 3], init_pose[1, 3], init_pose[2, 3] = (
                -pose[0, 3],
                pose[2, 3],
                0,
            )
            rays = gen_pano_rays(
                init_pose, self.cfg.pano_h, self.cfg.pano_w
            )
            conflict_mask = self.sup_pool.geo_check(
                rays, distances.unsqueeze(-1)
            )  # 0 is conflict, 1 not conflict
            # Drop pixels that conflict with previously registered views.
            pano_inpaint_mask *= conflict_mask

        # Fuse the view's geometry into the running mesh.
        self.rgbd_to_mesh(
            colors.permute(2, 0, 1),
            distances,
            pano_inpaint_mask,
            world_to_cam=pose,
        )

        # Register the view so later rounds are checked against it.
        self.sup_pool.register_sup_info(
            pose=init_pose,
            mask=pano_inpaint_mask.clone(),
            rgb=colors,
            distance=distances.unsqueeze(-1),
            normal=normals,
        )

        colors = colors.permute(2, 0, 1).unsqueeze(0)
        inpainted_panos_w_pose.append([colors, pose])

        if self.cfg.visualize:
            # Imported lazily: only needed for the optional visualization.
            from embodied_gen.data.utils import DiffrastRender

            tensor_to_pil(pano_rgb.unsqueeze(0)).save(
                f"{output_dir}/rendered_pano_{key}.jpg"
            )
            tensor_to_pil(colors).save(
                f"{output_dir}/inpainted_pano_{key}.jpg"
            )
            norm_depth = DiffrastRender.normalize_map_by_mask(
                distances, torch.ones_like(distances)
            )
            heatmap = (norm_depth.cpu().numpy() * 255).astype(np.uint8)
            # NOTE(review): cv2.applyColorMap returns BGR while
            # Image.fromarray interprets the array as RGB - confirm the
            # swapped channels are intended.
            heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
            Image.fromarray(heatmap).save(
                f"{output_dir}/inpainted_depth_{key}.png"
            )

    return inpainted_panos_w_pose

pano_to_cubemap

pano_to_cubemap(pano_rgb: Tensor)

Converts a panoramic RGB image to six cubemap views.

Parameters:

Name	Type	Description	Default
`pano_rgb`	`Tensor`	Panoramic RGB image tensor.	required

Returns:

Name	Type	Description
`list`		List of cubemap RGB tensors.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def pano_to_cubemap(self, pano_rgb: torch.Tensor):
    """Splits a panoramic RGB image into six perspective cubemap views.

    Args:
        pano_rgb (torch.Tensor): Panoramic RGB image tensor.

    Returns:
        list: Six cubemap RGB tensors on the CPU, in the order of the
        (pitch, yaw) directions listed below.
    """
    # (pitch, yaw) of the six cube faces, in radians.
    face_directions = (
        (0, 0),
        (0, 1.5 * np.pi),
        (0, 1.0 * np.pi),
        (0, 0.5 * np.pi),
        (-0.5 * np.pi, 0),
        (0.5 * np.pi, 0),
    )

    return [
        self.pano_to_perpective(
            pano_rgb, pitch, yaw, fov=self.cfg.fov
        ).cpu()
        for pitch, yaw in face_directions
    ]

pano_to_perpective

pano_to_perpective(pano_image: Tensor, pitch: float, yaw: float, fov: float) -> torch.Tensor

Converts a panoramic image to a perspective view.

Parameters:

Name	Type	Description	Default
`pano_image`	`Tensor`	Panoramic image tensor.	required
`pitch`	`float`	Pitch angle.	required
`yaw`	`float`	Yaw angle.	required
`fov`	`float`	Field of view.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: Perspective image tensor.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def pano_to_perpective(
    self, pano_image: torch.Tensor, pitch: float, yaw: float, fov: float
) -> torch.Tensor:
    """Samples one perspective view out of a panoramic image.

    Args:
        pano_image (torch.Tensor): Panoramic image tensor; its leading axis
            is squeezed before conversion.
        pitch (float): Pitch angle.
        yaw (float): Yaw angle.
        fov (float): Field of view, passed as the horizontal FOV.

    Returns:
        torch.Tensor: Perspective image tensor of size
        ``cfg.cubemap_h`` x ``cfg.cubemap_w`` with a leading batch axis.
    """
    # Roll is always zero; only pitch and yaw vary per view.
    rotation = {"roll": 0, "pitch": pitch, "yaw": yaw}
    view = equi2pers(
        equi=pano_image.squeeze(0),
        rots=rotation,
        height=self.cfg.cubemap_h,
        width=self.cfg.cubemap_w,
        fov_x=fov,
        mode="bilinear",
    )
    return view.unsqueeze(0)

preprocess_pano

preprocess_pano(image: Image | str) -> tuple[torch.Tensor, torch.Tensor]

Preprocesses a panoramic image for mesh generation.

Parameters:

Name	Type	Description	Default
`image`	`Image \| str`	Input image or path.	required

Returns:

Type	Description
`tuple[Tensor, Tensor]`	tuple[torch.Tensor, torch.Tensor]: Preprocessed RGB and depth tensors.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def preprocess_pano(
    self, image: Image.Image | str
) -> tuple[torch.Tensor, torch.Tensor]:
    """Preprocesses a panoramic image for mesh generation.

    The image is converted to RGB, transposed when it is taller than wide,
    resized to ``cfg.pano_w`` and fed to the distance predictor. The
    predicted depth is rescaled so its maximum equals
    ``cfg.depth_scale_factor``.

    Args:
        image (Image.Image | str): Input image or path.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: Preprocessed RGB tensor
        (channels first, values in [0, 1], on ``self.device``) and depth
        tensor.
    """
    if isinstance(image, str):
        # Close the file handle as soon as the pixels are decoded;
        # convert() returns an independent in-memory image.
        with Image.open(image) as opened:
            image = opened.convert("RGB")
    else:
        image = image.convert("RGB")

    # PIL's size is (width, height): make the long side horizontal.
    if image.size[0] < image.size[1]:
        image = image.transpose(Image.TRANSPOSE)

    image = resize_image_with_aspect_ratio(image, self.cfg.pano_w)
    image_rgb = torch.tensor(np.array(image)).permute(2, 0, 1) / 255
    image_rgb = image_rgb.to(self.device)
    # The predictor is given the image in height-width-channel layout.
    image_depth = self.pano_fusion_distance_predictor.predict(
        image_rgb.permute(1, 2, 0)
    )
    image_depth = (
        image_depth / image_depth.max() * self.cfg.depth_scale_factor
    )

    return image_rgb, image_depth

project

project(world_to_cam: Tensor)

Projects the mesh to an image using the given camera pose.

Parameters:

Name	Type	Description	Default
`world_to_cam`	`Tensor`	World-to-camera transformation matrix.	required

Returns:

Type	Description
	Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Projected RGB image, inpaint mask, and depth map.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def project(self, world_to_cam: torch.Tensor):
    """Renders the current mesh from the given camera pose.

    Args:
        world_to_cam (torch.Tensor): World-to-camera transformation matrix.

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Projected RGB image
        (first three channels, zeroed where the inpaint mask is set),
        inpaint mask, and depth map.
    """
    render_outputs = render_mesh(
        vertices=self.vertices,
        faces=self.faces,
        vertex_features=self.colors,
        H=self.cfg.cubemap_h,
        W=self.cfg.cubemap_w,
        fov_in_degrees=self.cfg.fov,
        RT=world_to_cam,
        blur_radius=self.cfg.blur_radius,
        faces_per_pixel=self.cfg.faces_per_pixel,
    )
    # Only the image, depth and mask are used; the last three outputs are
    # discarded.
    rendered_image, rendered_depth, inpaint_mask, _, _, _ = render_outputs

    # Blank out the pixels flagged by the inpaint mask.
    masked_image = rendered_image * ~inpaint_mask

    return masked_image[:3, ...], inpaint_mask, rendered_depth

read_camera_pose_file `staticmethod`

read_camera_pose_file(filepath: str) -> np.ndarray

Reads a camera pose file and returns the pose matrix.

Parameters:

Name	Type	Description	Default
`filepath`	`str`	Path to the camera pose file.	required

Returns:

Type	Description
`ndarray`	np.ndarray: 4x4 camera pose matrix.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

@staticmethod
def read_camera_pose_file(filepath: str) -> np.ndarray:
    """Parses a whitespace-separated text file into a 4x4 pose matrix.

    Args:
        filepath (str): Path to the camera pose file.

    Returns:
        np.ndarray: 4x4 camera pose matrix, filled row by row from the
        numbers in the file.
    """
    with open(filepath, "r") as pose_file:
        tokens = pose_file.read().split()

    flat_values = list(map(float, tokens))
    return np.array(flat_values).reshape(4, 4)

render_pano

render_pano(pose: Tensor)

Renders a panorama from the mesh using the given pose.

Parameters:

Name	Type	Description	Default
`pose`	`Tensor`	Camera pose.	required

Returns:

Type	Description
	Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: RGB panorama, depth map, and mask.

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def render_pano(self, pose: torch.Tensor):
    """Renders the mesh into a panorama as seen from the given pose.

    Each cubemap face is rendered separately, then the faces are merged
    into one equirectangular image.

    Args:
        pose (torch.Tensor): Camera pose.

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: RGB panorama, depth
        map, and mask.
    """
    face_stacks = []
    for face_w2c in self.cubemap_w2cs:
        rgb, inpaint_mask, depth = self.project(face_w2c @ pose)
        # Stack color, distance and mask channel-wise so all three are
        # converted to the panorama in a single pass.
        channels = [
            rgb,
            depth_to_distance(depth[None, ...]),
            inpaint_mask[None, ...],
        ]
        face_stacks.append(torch.cat(channels, dim=0))

    # Set default tensor type for CPU operation in cube2equi
    with torch.device("cpu"):
        pano_rgbd = cube2equi(
            face_stacks, "list", self.cfg.pano_h, self.cfg.pano_w
        )

    # Split the stacked channels again: 0-2 color, 3 distance, 4+ mask.
    pano_rgb = pano_rgbd[:3, :, :]
    pano_depth = pano_rgbd[3:4, :, :].squeeze(0)
    pano_mask = pano_rgbd[4:, :, :].squeeze(0)
    return pano_rgb, pano_depth, pano_mask

rgbd_to_mesh

rgbd_to_mesh(rgb: Tensor, depth: Tensor, inpaint_mask: Tensor, world_to_cam: Tensor = None, using_distance_map: bool = True) -> None

Converts RGB-D images to mesh and updates mesh parameters.

Parameters:

Name	Type	Description	Default
`rgb`	`Tensor`	RGB image tensor.	required
`depth`	`Tensor`	Depth map tensor.	required
`inpaint_mask`	`Tensor`	Inpaint mask tensor.	required
`world_to_cam`	`Tensor`	Camera pose.	`None`
`using_distance_map`	`bool`	Whether to use distance map.	`True`

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def rgbd_to_mesh(
    self,
    rgb: torch.Tensor,
    depth: torch.Tensor,
    inpaint_mask: torch.Tensor,
    world_to_cam: torch.Tensor = None,
    using_distance_map: bool = True,
) -> None:
    """Lifts an RGB-D image to mesh geometry and appends it to the mesh.

    Nothing happens when the mask selects no pixel.

    Args:
        rgb (torch.Tensor): RGB image tensor.
        depth (torch.Tensor): Depth map tensor.
        inpaint_mask (torch.Tensor): Inpaint mask tensor.
        world_to_cam (torch.Tensor, optional): Camera pose. Defaults to the
            identity on ``self.device``.
        using_distance_map (bool, optional): Whether to use distance map.
    """
    # An all-zero mask contributes no geometry.
    if inpaint_mask.sum() == 0:
        return

    if world_to_cam is None:
        world_to_cam = torch.eye(4, dtype=torch.float32).to(self.device)

    new_vertices, new_faces, new_colors = features_to_world_space_mesh(
        colors=rgb.squeeze(0),
        depth=depth,
        fov_in_degrees=self.cfg.fov,
        world_to_cam=world_to_cam,
        mask=inpaint_mask,
        faces=self.faces,
        vertices=self.vertices,
        using_distance_map=using_distance_map,
        edge_threshold=0.05,
    )

    # Shift the new face indices past the vertices already in the mesh
    # before the vertex buffer grows.
    vertex_offset = self.vertices.shape[1]
    new_faces += vertex_offset

    self.vertices = torch.cat([self.vertices, new_vertices], dim=1)
    self.colors = torch.cat([self.colors, new_colors], dim=1)
    self.faces = torch.cat([self.faces, new_faces], dim=1)

save_mesh

save_mesh(output_path: str) -> None

Saves the mesh to a file.

Parameters:

Name	Type	Description	Default
`output_path`	`str`	Path to save the mesh file.	required

Source code in embodied_gen/trainer/pono2mesh_trainer.py

def save_mesh(self, output_path: str) -> None:
    """Exports the current mesh with per-vertex colors.

    Args:
        output_path (str): Path to save the mesh file.
    """
    # The buffers are stored column-wise; transpose them to one row per
    # vertex / face before handing them to trimesh.
    mesh = trimesh.Trimesh(
        vertices=self.vertices.T.cpu().numpy(),
        faces=self.faces.T.cpu().numpy(),
        vertex_colors=self.colors.T.cpu().numpy(),
    )
    mesh.export(output_path)

Trainer API

embodied_gen.trainer.gsplat_trainer

Runner

eval

embodied_gen.trainer.pono2mesh_trainer

Pano2MeshSRPipeline

__call__

get_edge_image_by_depth

init_mesh_params

inpaint_panorama

load_camera_poses

load_inpaint_poses

mesh_pose_to_gs_pose

mesh_repair_by_greedy_view_selection

pano_to_cubemap

pano_to_perpective

preprocess_pano

project

read_camera_pose_file staticmethod

render_pano

rgbd_to_mesh

save_mesh

__call__

read_camera_pose_file `staticmethod`