import os

import numpy as np
import torch
import torch.nn.functional as F
from torchvision.transforms import v2
from torchvision.utils import make_grid, save_image
from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
import pytorch_lightning as pl
from einops import rearrange, repeat

from src.utils.train_util import instantiate_from_config


class MVRecon(pl.LightningModule):
    """Sparse-view reconstruction trainer.

    Wraps an LRM-style generator: multi-view input images and cameras are
    encoded into triplanes, which are rendered at target views and supervised
    with RGB (MSE + LPIPS) and alpha-mask losses.
    """
    def __init__(
        self,
        lrm_generator_config,
        lrm_path=None,
        input_size=256,
        render_size=192,
    ):
        super().__init__()

        self.input_size = input_size
        self.render_size = render_size

        # init modules
        self.lrm_generator = instantiate_from_config(lrm_generator_config)
        if lrm_path is not None:
            # load pretrained LRM weights; map to CPU so every rank can load safely
            lrm_ckpt = torch.load(lrm_path, map_location='cpu')
            self.lrm_generator.load_state_dict(lrm_ckpt['weights'], strict=False)

        self.lpips = LearnedPerceptualImagePatchSimilarity(net_type='vgg')

        self.validation_step_outputs = []

    def on_fit_start(self):
        # `self.logdir` is expected to be set by the training script before fitting.
        if self.global_rank == 0:
            os.makedirs(os.path.join(self.logdir, 'images'), exist_ok=True)
            os.makedirs(os.path.join(self.logdir, 'images_val'), exist_ok=True)

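    # Camera conditioning used below:
    #   * `cameras` (generator input) is 16-D per input view: the first 12 entries of the
    #     flattened 4x4 camera-to-world matrix plus the intrinsics (fx, fy, cx, cy) read
    #     from the flattened 3x3 K at indices 0, 4, 2, 5.
    #   * `render_cameras` is 25-D per rendered view: flattened c2w (16) concatenated
    #     with flattened K (9), covering both input and target views.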
    def prepare_batch_data(self, batch):
        lrm_generator_input = {}
        render_gt = {}   # for supervision

        # input images
        images = batch['input_images']
        images = v2.functional.resize(
            images, self.input_size, interpolation=3, antialias=True).clamp(0, 1)

        lrm_generator_input['images'] = images.to(self.device)

        # input cameras and render cameras
        input_c2ws = batch['input_c2ws'].flatten(-2)
        input_Ks = batch['input_Ks'].flatten(-2)
        target_c2ws = batch['target_c2ws'].flatten(-2)
        target_Ks = batch['target_Ks'].flatten(-2)
        render_cameras_input = torch.cat([input_c2ws, input_Ks], dim=-1)
        render_cameras_target = torch.cat([target_c2ws, target_Ks], dim=-1)
        render_cameras = torch.cat([render_cameras_input, render_cameras_target], dim=1)

        input_extrinsics = input_c2ws[:, :, :12]
        input_intrinsics = torch.stack([
            input_Ks[:, :, 0], input_Ks[:, :, 4],
            input_Ks[:, :, 2], input_Ks[:, :, 5],
        ], dim=-1)
        cameras = torch.cat([input_extrinsics, input_intrinsics], dim=-1)

        # add noise to input cameras (uniform in [-0.02, 0.02]) for pose robustness
        cameras = cameras + torch.rand_like(cameras) * 0.04 - 0.02

        lrm_generator_input['cameras'] = cameras.to(self.device)
        lrm_generator_input['render_cameras'] = render_cameras.to(self.device)

        # target images: supervise the input views as well as the novel target views
        target_images = torch.cat([batch['input_images'], batch['target_images']], dim=1)
        target_depths = torch.cat([batch['input_depths'], batch['target_depths']], dim=1)
        target_alphas = torch.cat([batch['input_alphas'], batch['target_alphas']], dim=1)

        # random resize + crop augmentation of the render targets
        render_size = np.random.randint(self.render_size, 513)
        target_images = v2.functional.resize(
            target_images, render_size, interpolation=3, antialias=True).clamp(0, 1)
        target_depths = v2.functional.resize(
            target_depths, render_size, interpolation=0, antialias=True)
        target_alphas = v2.functional.resize(
            target_alphas, render_size, interpolation=0, antialias=True)

        crop_params = v2.RandomCrop.get_params(
            target_images, output_size=(self.render_size, self.render_size))
        target_images = v2.functional.crop(target_images, *crop_params)
        target_depths = v2.functional.crop(target_depths, *crop_params)[:, :, 0:1]
        target_alphas = v2.functional.crop(target_alphas, *crop_params)[:, :, 0:1]

        lrm_generator_input['render_size'] = render_size
        lrm_generator_input['crop_params'] = crop_params

        render_gt['target_images'] = target_images.to(self.device)
        render_gt['target_depths'] = target_depths.to(self.device)
        render_gt['target_alphas'] = target_alphas.to(self.device)

        return lrm_generator_input, render_gt

    def prepare_validation_batch_data(self, batch):
        lrm_generator_input = {}

        # input images
        images = batch['input_images']
        images = v2.functional.resize(
            images, self.input_size, interpolation=3, antialias=True).clamp(0, 1)

        lrm_generator_input['images'] = images.to(self.device)

        input_c2ws = batch['input_c2ws'].flatten(-2)
        input_Ks = batch['input_Ks'].flatten(-2)

        input_extrinsics = input_c2ws[:, :, :12]
        input_intrinsics = torch.stack([
            input_Ks[:, :, 0], input_Ks[:, :, 4],
            input_Ks[:, :, 2], input_Ks[:, :, 5],
        ], dim=-1)
        cameras = torch.cat([input_extrinsics, input_intrinsics], dim=-1)

        lrm_generator_input['cameras'] = cameras.to(self.device)

        render_c2ws = batch['render_c2ws'].flatten(-2)
        render_Ks = batch['render_Ks'].flatten(-2)
        render_cameras = torch.cat([render_c2ws, render_Ks], dim=-1)

        lrm_generator_input['render_cameras'] = render_cameras.to(self.device)
        lrm_generator_input['render_size'] = 384
        lrm_generator_input['crop_params'] = None

        return lrm_generator_input

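    # The generator forward runs in two stages: triplane prediction from the input views,
    # then per-view rendering. Both stages use gradient checkpointing to save activation
    # memory, and rendering is chunked over views (`chunk_size`) to bound peak memory at
    # the cost of extra sequential passes.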
    def forward_lrm_generator(
        self,
        images,
        cameras,
        render_cameras,
        render_size=192,
        crop_params=None,
        chunk_size=1,
    ):
        planes = torch.utils.checkpoint.checkpoint(
            self.lrm_generator.forward_planes,
            images,
            cameras,
            use_reentrant=False,
        )
        frames = []
        for i in range(0, render_cameras.shape[1], chunk_size):
            frames.append(
                torch.utils.checkpoint.checkpoint(
                    self.lrm_generator.synthesizer,
                    planes,
                    cameras=render_cameras[:, i:i+chunk_size],
                    render_size=render_size,
                    crop_params=crop_params,
                    use_reentrant=False,
                )
            )
        frames = {
            k: torch.cat([r[k] for r in frames], dim=1)
            for k in frames[0].keys()
        }
        return frames

    def forward(self, lrm_generator_input):
        images = lrm_generator_input['images']
        cameras = lrm_generator_input['cameras']
        render_cameras = lrm_generator_input['render_cameras']
        render_size = lrm_generator_input['render_size']
        crop_params = lrm_generator_input['crop_params']

        out = self.forward_lrm_generator(
            images,
            cameras,
            render_cameras,
            render_size=render_size,
            crop_params=crop_params,
            chunk_size=1,
        )
        render_images = torch.clamp(out['images_rgb'], 0.0, 1.0)
        render_depths = out['images_depth']
        render_alphas = torch.clamp(out['images_weight'], 0.0, 1.0)

        out = {
            'render_images': render_images,
            'render_depths': render_depths,
            'render_alphas': render_alphas,
        }
        return out

    def training_step(self, batch, batch_idx):
        lrm_generator_input, render_gt = self.prepare_batch_data(batch)

        render_out = self.forward(lrm_generator_input)

        loss, loss_dict = self.compute_loss(render_out, render_gt)

        self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True)

        # save a qualitative grid of inputs / targets / renderings every 1000 steps
        if self.global_step % 1000 == 0 and self.global_rank == 0:
            B, N, C, H, W = render_gt['target_images'].shape
            N_in = lrm_generator_input['images'].shape[1]

            input_images = v2.functional.resize(
                lrm_generator_input['images'], (H, W), interpolation=3, antialias=True).clamp(0, 1)
            # pad the input row with white frames so it tiles like the N supervised views
            input_images = torch.cat(
                [input_images, torch.ones(B, N-N_in, C, H, W).to(input_images)], dim=1)

            input_images = rearrange(
                input_images, 'b n c h w -> b c h (n w)')
            target_images = rearrange(
                render_gt['target_images'], 'b n c h w -> b c h (n w)')
            render_images = rearrange(
                render_out['render_images'], 'b n c h w -> b c h (n w)')
            target_alphas = rearrange(
                repeat(render_gt['target_alphas'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
            render_alphas = rearrange(
                repeat(render_out['render_alphas'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
            target_depths = rearrange(
                repeat(render_gt['target_depths'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
            render_depths = rearrange(
                repeat(render_out['render_depths'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')

            # normalize depths by the max GT depth for visualization
            MAX_DEPTH = torch.max(target_depths)
            target_depths = target_depths / MAX_DEPTH * target_alphas
            render_depths = render_depths / MAX_DEPTH

            grid = torch.cat([
                input_images,
                target_images, render_images,
                target_alphas, render_alphas,
                target_depths, render_depths,
            ], dim=-2)
            grid = make_grid(grid, nrow=target_images.shape[0], normalize=True, value_range=(0, 1))

            save_image(grid, os.path.join(self.logdir, 'images', f'train_{self.global_step:07d}.png'))

        return loss

    def compute_loss(self, render_out, render_gt):
        # NOTE: the rgb value range of OpenLRM is [0, 1]
        render_images = render_out['render_images']
        target_images = render_gt['target_images'].to(render_images)
        # flatten the view dimension and rescale to [-1, 1], as expected by LPIPS
        render_images = rearrange(render_images, 'b n ... -> (b n) ...') * 2.0 - 1.0
        target_images = rearrange(target_images, 'b n ... -> (b n) ...') * 2.0 - 1.0

        loss_mse = F.mse_loss(render_images, target_images)
        loss_lpips = 2.0 * self.lpips(render_images, target_images)

        render_alphas = render_out['render_alphas']
        target_alphas = render_gt['target_alphas']
        loss_mask = F.mse_loss(render_alphas, target_alphas)

        loss = loss_mse + loss_lpips + loss_mask

        prefix = 'train'
        loss_dict = {}
        loss_dict.update({f'{prefix}/loss_mse': loss_mse})
        loss_dict.update({f'{prefix}/loss_lpips': loss_lpips})
        loss_dict.update({f'{prefix}/loss_mask': loss_mask})
        loss_dict.update({f'{prefix}/loss': loss})

        return loss, loss_dict

    @torch.no_grad()
    def validation_step(self, batch, batch_idx):
        lrm_generator_input = self.prepare_validation_batch_data(batch)

        render_out = self.forward(lrm_generator_input)
        render_images = render_out['render_images']
        render_images = rearrange(render_images, 'b n c h w -> b c h (n w)')

        self.validation_step_outputs.append(render_images)

    def on_validation_epoch_end(self):
        images = torch.cat(self.validation_step_outputs, dim=-1)

        all_images = self.all_gather(images)
        all_images = rearrange(all_images, 'r b c h w -> (r b) c h w')

        if self.global_rank == 0:
            image_path = os.path.join(self.logdir, 'images_val', f'val_{self.global_step:07d}.png')

            grid = make_grid(all_images, nrow=1, normalize=True, value_range=(0, 1))
            save_image(grid, image_path)
            print(f"Saved image to {image_path}")

        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        # `self.learning_rate` is set externally (e.g. by the training script / config)
        lr = self.learning_rate

        params = []
        params.append({"params": self.lrm_generator.parameters(), "lr": lr, "weight_decay": 0.01})

        optimizer = torch.optim.AdamW(params, lr=lr, betas=(0.90, 0.95))
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=3000, eta_min=lr/10)

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}
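
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; the config path, config keys,
# learning rate, and log directory below are hypothetical placeholders,
# not defined by this module):
#
#     from omegaconf import OmegaConf
#     import pytorch_lightning as pl
#
#     cfg = OmegaConf.load('configs/lrm_train.yaml')           # hypothetical config file
#     model = MVRecon(
#         lrm_generator_config=cfg.model.params.lrm_generator_config,
#         lrm_path=None,
#         input_size=256,
#         render_size=192,
#     )
#     model.learning_rate = 4e-4                                # read by configure_optimizers
#     model.logdir = 'logs/mvrecon'                             # read by on_fit_start / image logging
#     trainer = pl.Trainer(accelerator='gpu', devices=1, max_steps=10)
#     trainer.fit(model, datamodule=...)                        # datamodule must yield the batch keys used above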