"""Positional / channel / direction encodings and layers that apply them."""

import math
from functools import lru_cache
from typing import Optional

import torch
import torch.nn as nn


def _encode_features(version: str, values: torch.Tensor) -> torch.Tensor:
    """
    Shared implementation behind `encode_position` and `encode_channels`.

    :param version: "v1" (cos/sin of the input scaled by 2**0 .. 2**9) or
                    "nerf" (`posenc_nerf` with min_deg=0, max_deg=15).
    :param values: [*shape, dim] tensor to encode along its last axis.
    :return: [*shape, dim * 20] for "v1", [*shape, dim * 31] for "nerf".
    :raises ValueError: if `version` is not a known encoding name.
    """
    if version == "v1":
        freqs = get_scales(0, 10, values.dtype, values.device).view(1, -1)
        # Every scalar is multiplied by every frequency: [numel, 10].
        phases = values.reshape(-1, 1) * freqs
        encoded = torch.cat([phases.cos(), phases.sin()], dim=1)
        return encoded.reshape(*values.shape[:-1], -1)
    if version == "nerf":
        return posenc_nerf(values, min_deg=0, max_deg=15)
    raise ValueError(version)


def encode_position(version: str, *, position: torch.Tensor):
    """
    Encode positions along their last axis.

    :param version: encoding name, "v1" or "nerf".
    :param position: [*shape, dim] tensor.
    :return: [*shape, dim * position_encoding_channels(version)]
    :raises ValueError: if `version` is unknown.
    """
    return _encode_features(version, position)


def encode_channels(version: str, *, channels: torch.Tensor):
    """
    Encode channel values along their last axis (same scheme as positions).

    :param version: encoding name, "v1" or "nerf".
    :param channels: [*shape, dim] tensor.
    :return: [*shape, dim * channel_encoding_channels(version)]
    :raises ValueError: if `version` is unknown.
    """
    return _encode_features(version, channels)


def position_encoding_channels(version: Optional[str] = None) -> int:
    """
    Number of output features produced per input position component.

    :param version: encoding name, or None for "no encoding" (returns 1).
    """
    if version is None:
        return 1
    # Probe the encoder with a single scalar and read the resulting width.
    return encode_position(version, position=torch.zeros(1, 1)).shape[-1]


def channel_encoding_channels(version: Optional[str] = None) -> int:
    """
    Number of output features produced per input channel.

    :param version: encoding name, or None for "no encoding" (returns 1).
    """
    if version is None:
        return 1
    return encode_channels(version, channels=torch.zeros(1, 1)).shape[-1]


def _direction_feature_count(version: Optional[str] = None) -> int:
    """
    Total number of features `maybe_encode_direction` emits for one 3-vector.

    Unlike positions, directions are not encoded per component for every
    version ("v1" uses a spherical-harmonics basis), so the width is measured
    on a full 3-vector rather than multiplied up from a scalar probe.

    :param version: encoding name, or None for "no encoding" (returns 3).
    """
    if version is None:
        return 3
    probe = torch.zeros(1, 3)
    return maybe_encode_direction(version, position=probe, direction=probe).shape[-1]


class PosEmbLinear(nn.Linear):
    """Linear layer that optionally position-encodes its input first."""

    def __init__(
        self, posemb_version: Optional[str], in_features: int, out_features: int, **kwargs
    ):
        """
        :param posemb_version: encoding name, or None to skip the encoding.
        :param in_features: size of the last axis of the *unencoded* input.
        :param out_features: output size of the linear projection.
        :param kwargs: forwarded to `nn.Linear`.
        """
        super().__init__(
            in_features * position_encoding_channels(posemb_version),
            out_features,
            **kwargs,
        )
        self.posemb_version = posemb_version

    def forward(self, x: torch.Tensor):
        """Encode `x` along its last axis (if enabled) and project it."""
        if self.posemb_version is not None:
            x = encode_position(self.posemb_version, position=x)
        return super().forward(x)


class MultiviewPoseEmbedding(nn.Conv2d):
    """3x3 convolution over concatenated channel, position and direction features."""

    def __init__(
        self,
        posemb_version: Optional[str],
        n_channels: int,
        out_features: int,
        stride: int = 1,
        **kwargs,
    ):
        """
        :param posemb_version: encoding name, or None to use the raw inputs.
        :param n_channels: number of channels in the `channels` input.
        :param out_features: number of output channels of the convolution.
        :param stride: convolution stride.
        :param kwargs: forwarded to `nn.Conv2d`.
        """
        # The direction branch goes through `maybe_encode_direction`, whose
        # output width differs from the position encoding, so it has to be
        # counted separately for the conv input size to match `forward`.
        in_features = (
            n_channels * channel_encoding_channels(version=posemb_version)
            + 3 * position_encoding_channels(version=posemb_version)
            + _direction_feature_count(version=posemb_version)
        )
        super().__init__(
            in_features,
            out_features,
            kernel_size=3,
            stride=stride,
            padding=1,
            **kwargs,
        )
        self.posemb_version = posemb_version

    def forward(
        self, channels: torch.Tensor, position: torch.Tensor, direction: torch.Tensor
    ) -> torch.Tensor:
        """
        :param channels: [batch_shape, inner_batch_shape, n_channels, height, width]
        :param position: [batch_shape, inner_batch_shape, 3, height, width]
        :param direction: [batch_shape, inner_batch_shape, 3, height, width]
        :return: [*batch_shape, out_features, height, width]
        """
        if self.posemb_version is not None:
            # The encoders work on the last axis, so move features last,
            # encode, then move them back in front of (height, width).
            channels = channels.permute(0, 1, 3, 4, 2)
            position = position.permute(0, 1, 3, 4, 2)
            direction = direction.permute(0, 1, 3, 4, 2)
            channels = encode_channels(self.posemb_version, channels=channels).permute(
                0, 1, 4, 2, 3
            )
            direction = maybe_encode_direction(
                self.posemb_version, position=position, direction=direction
            ).permute(0, 1, 4, 2, 3)
            position = encode_position(self.posemb_version, position=position).permute(
                0, 1, 4, 2, 3
            )
        x = torch.cat([channels, position, direction], dim=-3)
        *batch_shape, in_features, height, width = x.shape
        # Conv2d expects a single batch axis: flatten, convolve, restore.
        return (
            super()
            .forward(x.view(-1, in_features, height, width))
            .view(*batch_shape, -1, height, width)
        )


class MultiviewPointCloudEmbedding(nn.Conv2d):
    """3x3 convolution over channel/origin/position features with a learned fill token."""

    def __init__(
        self,
        posemb_version: Optional[str],
        n_channels: int,
        out_features: int,
        stride: int = 1,
        **kwargs,
    ):
        """
        :param posemb_version: encoding name, or None to use the raw inputs.
        :param n_channels: number of channels in the `channels` input.
        :param out_features: number of output channels of the convolution.
        :param stride: convolution stride.
        :param kwargs: forwarded to `nn.Conv2d`; `device` / `dtype` are also
                       used when creating the `unk_token` parameter.
        """
        in_features = (
            n_channels * channel_encoding_channels(version=posemb_version)
            + 3 * position_encoding_channels(version=posemb_version)
            + 3 * position_encoding_channels(version=posemb_version)
        )
        super().__init__(
            in_features,
            out_features,
            kernel_size=3,
            stride=stride,
            padding=1,
            **kwargs,
        )
        self.posemb_version = posemb_version
        # Only tensor-factory options are meaningful for `torch.randn`;
        # conv-only options such as `bias` would make it raise TypeError.
        factory_kwargs = {key: kwargs[key] for key in ("device", "dtype") if key in kwargs}
        self.register_parameter(
            "unk_token", nn.Parameter(torch.randn(in_features, **factory_kwargs) * 0.01)
        )
        self.unk_token: torch.Tensor

    def forward(
        self,
        channels: torch.Tensor,
        origin: torch.Tensor,
        position: torch.Tensor,
        mask: torch.Tensor,
    ) -> torch.Tensor:
        """
        :param channels: [batch_shape, inner_batch_shape, n_channels, height, width]
        :param origin: [batch_shape, inner_batch_shape, 3, height, width]
        :param position: [batch_shape, inner_batch_shape, 3, height, width]
        :param mask: condition passed to `torch.where`; where it is true the
                     computed features are kept, elsewhere `unk_token` is used.
                     Presumably a boolean tensor broadcastable to
                     [batch_shape, inner_batch_shape, in_features, height, width]
                     - confirm against callers.
        :return: [*batch_shape, out_features, height, width]
        """
        if self.posemb_version is not None:
            # The encoders work on the last axis, so move features last,
            # encode, then move them back in front of (height, width).
            channels = channels.permute(0, 1, 3, 4, 2)
            origin = origin.permute(0, 1, 3, 4, 2)
            position = position.permute(0, 1, 3, 4, 2)
            channels = encode_channels(self.posemb_version, channels=channels).permute(
                0, 1, 4, 2, 3
            )
            origin = encode_position(self.posemb_version, position=origin).permute(0, 1, 4, 2, 3)
            position = encode_position(self.posemb_version, position=position).permute(
                0, 1, 4, 2, 3
            )
        x = torch.cat([channels, origin, position], dim=-3)
        unk_token = torch.broadcast_to(self.unk_token.view(1, 1, -1, 1, 1), x.shape)
        x = torch.where(mask, x, unk_token)
        *batch_shape, in_features, height, width = x.shape
        # Conv2d expects a single batch axis: flatten, convolve, restore.
        return (
            super()
            .forward(x.view(-1, in_features, height, width))
            .view(*batch_shape, -1, height, width)
        )


def maybe_encode_direction(
    version: str,
    *,
    position: torch.Tensor,
    direction: Optional[torch.Tensor] = None,
):
    """
    Encode view directions, or return zeros of the right shape when absent.

    :param version: "v1" (spherical harmonics basis of degree 4) or
                    "nerf" (`posenc_nerf` with min_deg=0, max_deg=8).
    :param position: [*shape, dim]; only used for its shape/dtype/device when
                     `direction` is None.
    :param direction: [*shape, 3] directions, or None.
    :return: [*shape, 16] for "v1"; `posenc_nerf` output for "nerf".
    :raises ValueError: if `version` is unknown.
    """
    if version == "v1":
        sh_degree = 4
        if direction is None:
            return torch.zeros(*position.shape[:-1], sh_degree**2).to(position)
        return spherical_harmonics_basis(direction, sh_degree=sh_degree)
    elif version == "nerf":
        if direction is None:
            return torch.zeros_like(posenc_nerf(position, min_deg=0, max_deg=8))
        return posenc_nerf(direction, min_deg=0, max_deg=8)
    else:
        raise ValueError(version)


def posenc_nerf(x: torch.Tensor, min_deg: int = 0, max_deg: int = 15) -> torch.Tensor:
    """
    Concatenate x and its positional encodings, following NeRF.

    Reference: https://arxiv.org/pdf/2210.04628.pdf

    :param x: [*shape, dim] tensor.
    :param min_deg: exponent of the lowest frequency (inclusive).
    :param max_deg: exponent of the highest frequency (exclusive).
    :return: `x` unchanged if min_deg == max_deg, otherwise
             [*shape, dim * (1 + 2 * (max_deg - min_deg))].
    """
    if min_deg == max_deg:
        return x
    scales = get_scales(min_deg, max_deg, x.dtype, x.device)
    *shape, dim = x.shape
    xb = (x.reshape(-1, 1, dim) * scales.view(1, -1, 1)).reshape(*shape, -1)
    assert xb.shape[-1] == dim * (max_deg - min_deg)
    # sin(t + pi/2) == cos(t), so one sin() call yields both halves.
    emb = torch.cat([xb, xb + math.pi / 2.0], dim=-1).sin()
    return torch.cat([x, emb], dim=-1)


@lru_cache
def get_scales(
    min_deg: int,
    max_deg: int,
    dtype: torch.dtype,
    device: torch.device,
) -> torch.Tensor:
    """
    Return 2**min_deg .. 2**(max_deg - 1) as a 1-D tensor.

    The result is memoized per argument tuple, so callers receive a shared
    tensor and must not modify it in place.
    """
    return 2.0 ** torch.arange(min_deg, max_deg, device=device, dtype=dtype)


def spherical_harmonics_basis(
    coords: torch.Tensor,
    sh_degree: int,
) -> torch.Tensor:
    """
    Calculate the spherical harmonics basis

    :param coords: [batch_size, *shape, 3] of unit norm
    :param sh_degree: Spherical harmonics degree (1 through 8)
    :return: [batch_size, *shape, sh_degree**2]
    :raises NotImplementedError: if sh_degree > 8
    :raises ValueError: if sh_degree < 1
    """
    if sh_degree > 8:
        raise NotImplementedError
    if sh_degree < 1:
        raise ValueError(f"sh_degree must be at least 1, got {sh_degree}")

    batch_size, *shape, _ = coords.shape
    x, y, z = coords.reshape(-1, 3).unbind(dim=-1)

    xy, xz, yz = x * y, x * z, y * z
    x2, y2, z2 = x * x, y * y, z * z
    x4, y4, z4 = x2 * x2, y2 * y2, z2 * z2
    x6, y6, z6 = x4 * x2, y4 * y2, z4 * z2

    # https://github.com/NVlabs/tiny-cuda-nn/blob/8575542682cb67cddfc748cc3d3cfc12593799aa/include/tiny-cuda-nn/encodings/spherical_harmonics.h#L76

    out = torch.zeros(x.shape[0], sh_degree**2, dtype=x.dtype, device=x.device)

    def _sh():
        # Fills `out` band by band and stops once `sh_degree` bands are written.
        out[:, 0] = 0.28209479177387814  # 1/(2*sqrt(pi))
        if sh_degree <= 1:
            return
        out[:, 1] = -0.48860251190291987 * y  # -sqrt(3)*y/(2*sqrt(pi))
        out[:, 2] = 0.48860251190291987 * z  # sqrt(3)*z/(2*sqrt(pi))
        out[:, 3] = -0.48860251190291987 * x  # -sqrt(3)*x/(2*sqrt(pi))
        if sh_degree <= 2:
            return
        out[:, 4] = 1.0925484305920792 * xy  # sqrt(15)*xy/(2*sqrt(pi))
        out[:, 5] = -1.0925484305920792 * yz  # -sqrt(15)*yz/(2*sqrt(pi))
        out[:, 6] = (
            0.94617469575755997 * z2 - 0.31539156525251999
        )  # sqrt(5)*(3*z2 - 1)/(4*sqrt(pi))
        out[:, 7] = -1.0925484305920792 * xz  # -sqrt(15)*xz/(2*sqrt(pi))
        out[:, 8] = (
            0.54627421529603959 * x2 - 0.54627421529603959 * y2
        )  # sqrt(15)*(x2 - y2)/(4*sqrt(pi))
        if sh_degree <= 3:
            return
        out[:, 9] = (
            0.59004358992664352 * y * (-3.0 * x2 + y2)
        )  # sqrt(70)*y*(-3*x2 + y2)/(8*sqrt(pi))
        out[:, 10] = 2.8906114426405538 * xy * z  # sqrt(105)*xy*z/(2*sqrt(pi))
        out[:, 11] = (
            0.45704579946446572 * y * (1.0 - 5.0 * z2)
        )  # sqrt(42)*y*(1 - 5*z2)/(8*sqrt(pi))
        out[:, 12] = 0.3731763325901154 * z * (5.0 * z2 - 3.0)  # sqrt(7)*z*(5*z2 - 3)/(4*sqrt(pi))
        out[:, 13] = (
            0.45704579946446572 * x * (1.0 - 5.0 * z2)
        )  # sqrt(42)*x*(1 - 5*z2)/(8*sqrt(pi))
        out[:, 14] = 1.4453057213202769 * z * (x2 - y2)  # sqrt(105)*z*(x2 - y2)/(4*sqrt(pi))
        out[:, 15] = (
            0.59004358992664352 * x * (-x2 + 3.0 * y2)
        )  # sqrt(70)*x*(-x2 + 3*y2)/(8*sqrt(pi))
        if sh_degree <= 4:
            return
        out[:, 16] = 2.5033429417967046 * xy * (x2 - y2)  # 3*sqrt(35)*xy*(x2 - y2)/(4*sqrt(pi))
        out[:, 17] = (
            1.7701307697799304 * yz * (-3.0 * x2 + y2)
        )  # 3*sqrt(70)*yz*(-3*x2 + y2)/(8*sqrt(pi))
        out[:, 18] = (
            0.94617469575756008 * xy * (7.0 * z2 - 1.0)
        )  # 3*sqrt(5)*xy*(7*z2 - 1)/(4*sqrt(pi))
        out[:, 19] = (
            0.66904654355728921 * yz * (3.0 - 7.0 * z2)
        )  # 3*sqrt(10)*yz*(3 - 7*z2)/(8*sqrt(pi))
        out[:, 20] = (
            -3.1735664074561294 * z2 + 3.7024941420321507 * z4 + 0.31735664074561293
        )  # 3*(-30*z2 + 35*z4 + 3)/(16*sqrt(pi))
        out[:, 21] = (
            0.66904654355728921 * xz * (3.0 - 7.0 * z2)
        )  # 3*sqrt(10)*xz*(3 - 7*z2)/(8*sqrt(pi))
        out[:, 22] = (
            0.47308734787878004 * (x2 - y2) * (7.0 * z2 - 1.0)
        )  # 3*sqrt(5)*(x2 - y2)*(7*z2 - 1)/(8*sqrt(pi))
        out[:, 23] = (
            1.7701307697799304 * xz * (-x2 + 3.0 * y2)
        )  # 3*sqrt(70)*xz*(-x2 + 3*y2)/(8*sqrt(pi))
        out[:, 24] = (
            -3.7550144126950569 * x2 * y2 + 0.62583573544917614 * x4 + 0.62583573544917614 * y4
        )  # 3*sqrt(35)*(-6*x2*y2 + x4 + y4)/(16*sqrt(pi))
        if sh_degree <= 5:
            return
        out[:, 25] = (
            0.65638205684017015 * y * (10.0 * x2 * y2 - 5.0 * x4 - y4)
        )  # 3*sqrt(154)*y*(10*x2*y2 - 5*x4 - y4)/(32*sqrt(pi))
        out[:, 26] = (
            8.3026492595241645 * xy * z * (x2 - y2)
        )  # 3*sqrt(385)*xy*z*(x2 - y2)/(4*sqrt(pi))
        out[:, 27] = (
            -0.48923829943525038 * y * (3.0 * x2 - y2) * (9.0 * z2 - 1.0)
        )  # -sqrt(770)*y*(3*x2 - y2)*(9*z2 - 1)/(32*sqrt(pi))
        out[:, 28] = (
            4.7935367849733241 * xy * z * (3.0 * z2 - 1.0)
        )  # sqrt(1155)*xy*z*(3*z2 - 1)/(4*sqrt(pi))
        out[:, 29] = (
            0.45294665119569694 * y * (14.0 * z2 - 21.0 * z4 - 1.0)
        )  # sqrt(165)*y*(14*z2 - 21*z4 - 1)/(16*sqrt(pi))
        out[:, 30] = (
            0.1169503224534236 * z * (-70.0 * z2 + 63.0 * z4 + 15.0)
        )  # sqrt(11)*z*(-70*z2 + 63*z4 + 15)/(16*sqrt(pi))
        out[:, 31] = (
            0.45294665119569694 * x * (14.0 * z2 - 21.0 * z4 - 1.0)
        )  # sqrt(165)*x*(14*z2 - 21*z4 - 1)/(16*sqrt(pi))
        out[:, 32] = (
            2.3967683924866621 * z * (x2 - y2) * (3.0 * z2 - 1.0)
        )  # sqrt(1155)*z*(x2 - y2)*(3*z2 - 1)/(8*sqrt(pi))
        out[:, 33] = (
            -0.48923829943525038 * x * (x2 - 3.0 * y2) * (9.0 * z2 - 1.0)
        )  # -sqrt(770)*x*(x2 - 3*y2)*(9*z2 - 1)/(32*sqrt(pi))
        out[:, 34] = (
            2.0756623148810411 * z * (-6.0 * x2 * y2 + x4 + y4)
        )  # 3*sqrt(385)*z*(-6*x2*y2 + x4 + y4)/(16*sqrt(pi))
        out[:, 35] = (
            0.65638205684017015 * x * (10.0 * x2 * y2 - x4 - 5.0 * y4)
        )  # 3*sqrt(154)*x*(10*x2*y2 - x4 - 5*y4)/(32*sqrt(pi))
        if sh_degree <= 6:
            return
        out[:, 36] = (
            1.3663682103838286 * xy * (-10.0 * x2 * y2 + 3.0 * x4 + 3.0 * y4)
        )  # sqrt(6006)*xy*(-10*x2*y2 + 3*x4 + 3*y4)/(32*sqrt(pi))
        out[:, 37] = (
            2.3666191622317521 * yz * (10.0 * x2 * y2 - 5.0 * x4 - y4)
        )  # 3*sqrt(2002)*yz*(10*x2*y2 - 5*x4 - y4)/(32*sqrt(pi))
        out[:, 38] = (
            2.0182596029148963 * xy * (x2 - y2) * (11.0 * z2 - 1.0)
        )  # 3*sqrt(91)*xy*(x2 - y2)*(11*z2 - 1)/(8*sqrt(pi))
        out[:, 39] = (
            -0.92120525951492349 * yz * (3.0 * x2 - y2) * (11.0 * z2 - 3.0)
        )  # -sqrt(2730)*yz*(3*x2 - y2)*(11*z2 - 3)/(32*sqrt(pi))
        out[:, 40] = (
            0.92120525951492349 * xy * (-18.0 * z2 + 33.0 * z4 + 1.0)
        )  # sqrt(2730)*xy*(-18*z2 + 33*z4 + 1)/(32*sqrt(pi))
        out[:, 41] = (
            0.58262136251873131 * yz * (30.0 * z2 - 33.0 * z4 - 5.0)
        )  # sqrt(273)*yz*(30*z2 - 33*z4 - 5)/(16*sqrt(pi))
        out[:, 42] = (
            6.6747662381009842 * z2
            - 20.024298714302954 * z4
            + 14.684485723822165 * z6
            - 0.31784601133814211
        )  # sqrt(13)*(105*z2 - 315*z4 + 231*z6 - 5)/(32*sqrt(pi))
        out[:, 43] = (
            0.58262136251873131 * xz * (30.0 * z2 - 33.0 * z4 - 5.0)
        )  # sqrt(273)*xz*(30*z2 - 33*z4 - 5)/(16*sqrt(pi))
        out[:, 44] = (
            0.46060262975746175 * (x2 - y2) * (11.0 * z2 * (3.0 * z2 - 1.0) - 7.0 * z2 + 1.0)
        )  # sqrt(2730)*(x2 - y2)*(11*z2*(3*z2 - 1) - 7*z2 + 1)/(64*sqrt(pi))
        out[:, 45] = (
            -0.92120525951492349 * xz * (x2 - 3.0 * y2) * (11.0 * z2 - 3.0)
        )  # -sqrt(2730)*xz*(x2 - 3*y2)*(11*z2 - 3)/(32*sqrt(pi))
        out[:, 46] = (
            0.50456490072872406 * (11.0 * z2 - 1.0) * (-6.0 * x2 * y2 + x4 + y4)
        )  # 3*sqrt(91)*(11*z2 - 1)*(-6*x2*y2 + x4 + y4)/(32*sqrt(pi))
        out[:, 47] = (
            2.3666191622317521 * xz * (10.0 * x2 * y2 - x4 - 5.0 * y4)
        )  # 3*sqrt(2002)*xz*(10*x2*y2 - x4 - 5*y4)/(32*sqrt(pi))
        out[:, 48] = (
            10.247761577878714 * x2 * y4
            - 10.247761577878714 * x4 * y2
            + 0.6831841051919143 * x6
            - 0.6831841051919143 * y6
        )  # sqrt(6006)*(15*x2*y4 - 15*x4*y2 + x6 - y6)/(64*sqrt(pi))
        if sh_degree <= 7:
            return
        out[:, 49] = (
            0.70716273252459627 * y * (-21.0 * x2 * y4 + 35.0 * x4 * y2 - 7.0 * x6 + y6)
        )  # 3*sqrt(715)*y*(-21*x2*y4 + 35*x4*y2 - 7*x6 + y6)/(64*sqrt(pi))
        out[:, 50] = (
            5.2919213236038001 * xy * z * (-10.0 * x2 * y2 + 3.0 * x4 + 3.0 * y4)
        )  # 3*sqrt(10010)*xy*z*(-10*x2*y2 + 3*x4 + 3*y4)/(32*sqrt(pi))
        out[:, 51] = (
            -0.51891557872026028 * y * (13.0 * z2 - 1.0) * (-10.0 * x2 * y2 + 5.0 * x4 + y4)
        )  # -3*sqrt(385)*y*(13*z2 - 1)*(-10*x2*y2 + 5*x4 + y4)/(64*sqrt(pi))
        out[:, 52] = (
            4.1513246297620823 * xy * z * (x2 - y2) * (13.0 * z2 - 3.0)
        )  # 3*sqrt(385)*xy*z*(x2 - y2)*(13*z2 - 3)/(8*sqrt(pi))
        out[:, 53] = (
            -0.15645893386229404
            * y
            * (3.0 * x2 - y2)
            * (13.0 * z2 * (11.0 * z2 - 3.0) - 27.0 * z2 + 3.0)
        )  # -3*sqrt(35)*y*(3*x2 - y2)*(13*z2*(11*z2 - 3) - 27*z2 + 3)/(64*sqrt(pi))
        out[:, 54] = (
            0.44253269244498261 * xy * z * (-110.0 * z2 + 143.0 * z4 + 15.0)
        )  # 3*sqrt(70)*xy*z*(-110*z2 + 143*z4 + 15)/(32*sqrt(pi))
        out[:, 55] = (
            0.090331607582517306 * y * (-135.0 * z2 + 495.0 * z4 - 429.0 * z6 + 5.0)
        )  # sqrt(105)*y*(-135*z2 + 495*z4 - 429*z6 + 5)/(64*sqrt(pi))
        out[:, 56] = (
            0.068284276912004949 * z * (315.0 * z2 - 693.0 * z4 + 429.0 * z6 - 35.0)
        )  # sqrt(15)*z*(315*z2 - 693*z4 + 429*z6 - 35)/(32*sqrt(pi))
        out[:, 57] = (
            0.090331607582517306 * x * (-135.0 * z2 + 495.0 * z4 - 429.0 * z6 + 5.0)
        )  # sqrt(105)*x*(-135*z2 + 495*z4 - 429*z6 + 5)/(64*sqrt(pi))
        out[:, 58] = (
            0.07375544874083044
            * z
            * (x2 - y2)
            * (143.0 * z2 * (3.0 * z2 - 1.0) - 187.0 * z2 + 45.0)
        )  # sqrt(70)*z*(x2 - y2)*(143*z2*(3*z2 - 1) - 187*z2 + 45)/(64*sqrt(pi))
        out[:, 59] = (
            -0.15645893386229404
            * x
            * (x2 - 3.0 * y2)
            * (13.0 * z2 * (11.0 * z2 - 3.0) - 27.0 * z2 + 3.0)
        )  # -3*sqrt(35)*x*(x2 - 3*y2)*(13*z2*(11*z2 - 3) - 27*z2 + 3)/(64*sqrt(pi))
        out[:, 60] = (
            1.0378311574405206 * z * (13.0 * z2 - 3.0) * (-6.0 * x2 * y2 + x4 + y4)
        )  # 3*sqrt(385)*z*(13*z2 - 3)*(-6*x2*y2 + x4 + y4)/(32*sqrt(pi))
        out[:, 61] = (
            -0.51891557872026028 * x * (13.0 * z2 - 1.0) * (-10.0 * x2 * y2 + x4 + 5.0 * y4)
        )  # -3*sqrt(385)*x*(13*z2 - 1)*(-10*x2*y2 + x4 + 5*y4)/(64*sqrt(pi))
        out[:, 62] = (
            2.6459606618019 * z * (15.0 * x2 * y4 - 15.0 * x4 * y2 + x6 - y6)
        )  # 3*sqrt(10010)*z*(15*x2*y4 - 15*x4*y2 + x6 - y6)/(64*sqrt(pi))
        out[:, 63] = (
            0.70716273252459627 * x * (-35.0 * x2 * y4 + 21.0 * x4 * y2 - x6 + 7.0 * y6)
        )  # 3*sqrt(715)*x*(-35*x2*y4 + 21*x4*y2 - x6 + 7*y6)/(64*sqrt(pi))

    _sh()
    return out.view(batch_size, *shape, sh_degree**2)