Assignment 3¶

Sayan Mondal¶

Number of late days used¶

Late days

1. Differentiable Volume Rendering¶

1.3. Ray sampling (10 points)¶

Implementation
1. get_pixels_from_image in ray_utils.py
2. get_rays_from_pixels in ray_utils.py
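For context, a minimal sketch of how these two helpers compose (hypothetical usage; the actual call sites are in main.py):

# Build a full-image ray bundle for one camera.
xy_grid = get_pixels_from_image(image_size, camera)              # (H*W, 2) NDC pixel coordinates
ray_bundle = get_rays_from_pixels(xy_grid, image_size, camera)   # per-pixel origins and unit directions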

In [ ]:
# Generate pixel coordinates in NDC space (in [-1, 1])
def get_pixels_from_image(image_size, camera):
    W, H = image_size[0], image_size[1]

    # TODO (1.3): Generate pixel coordinates from [0, W] in x and [0, H] in y
    x = torch.arange(0, W)
    y = torch.arange(0, H)

    # TODO (1.3): Convert to the range [-1, 1] in both x and y
    x = (2.0 * x / W) - 1.0
    y = (2.0 * y / H) - 1.0

    # Create grid of coordinates
    xy_grid = torch.stack(
        tuple(reversed(torch.meshgrid(y, x))),
        dim=-1,
    ).view(W * H, 2)

    # Negate to match PyTorch3D's NDC convention (+X left, +Y up)
    return -xy_grid
In [ ]:
# Get rays from pixel values
def get_rays_from_pixels(xy_grid, image_size, camera):
    W, H = image_size[0], image_size[1]

    # TODO (1.3): Map pixels to points on the image plane at Z=1
    device = get_device()
    ndc_points = xy_grid.to(device)

    # Append Z=1 so every NDC pixel becomes a point on the image plane
    ndc_points = torch.cat(
        [
            ndc_points,
            torch.ones_like(ndc_points[..., -1:])
        ],
        dim=-1
    )

    # TODO (1.3): Use camera.unproject to get world space points on the image plane from NDC space points
    ndc_to_world = camera.unproject_points(ndc_points, world_coordinates=True)

    # TODO (1.3): Get ray origins from camera center
    rays_o = camera.get_camera_center() * torch.ones_like(ndc_points)

    # TODO (1.3): Get normalized ray directions
    rays_d = torch.nn.functional.normalize(ndc_to_world - rays_o, dim=-1)

    # Create and return RayBundle
    return RayBundle(
        rays_o,
        rays_d,
        torch.zeros_like(rays_o).unsqueeze(1),
        torch.zeros_like(rays_o).unsqueeze(1),
    )

Visualization

Code in main.py:

In [ ]:
# TODO (1.3): Visualize xy grid using vis_grid
    W, H = image_size[0], image_size[1]

    if cam_idx == 0 and file_prefix == '':
        # Map the NDC grid from [-1, 1] to [0, 1] and pad a zero blue channel
        image = xy_grid.cpu().numpy().reshape(H, W, 2)
        image = (image + 1) / 2.0
        image = np.absolute(image)
        image = np.concatenate((image, np.zeros((H, W, 1))), axis=-1)
        plt.figure(figsize=(10, 10))
        plt.imshow(image)
        plt.imsave('vis_grid.png', image)
        plt.axis("off")

    # TODO (1.3): Visualize rays using vis_rays
    if cam_idx == 0 and file_prefix == '':
        # Visualize the absolute value of the ray directions as RGB
        rays_d = ray_bundle.directions
        image = rays_d.cpu().numpy().reshape(H, W, 3)
        image = np.absolute(image)
        plt.figure(figsize=(10, 10))
        plt.imshow(image)
        plt.imsave('vis_rays.png', image)
        plt.axis("off")

xy_grid visualization:

vis_grid.png

rays visualization:

vis_rays.png

1.4. Point sampling (10 points)¶

Implementation
Filled out the StratifiedRaysampler class in sampler.py.

In [ ]:
# Sampler which implements stratified (uniform) point sampling along rays
class StratifiedRaysampler(torch.nn.Module):
    def __init__(
        self,
        cfg
    ):
        super().__init__()

        self.n_pts_per_ray = cfg.n_pts_per_ray
        self.min_depth = cfg.min_depth
        self.max_depth = cfg.max_depth

    def forward(
        self,
        ray_bundle,
    ):
        # TODO (1.4): Compute z values for self.n_pts_per_ray points uniformly sampled between [near, far]
        z_vals = torch.linspace(self.min_depth, self.max_depth, self.n_pts_per_ray, device=get_device())

        # TODO (1.4): Sample points from z values
        # p = o + z * d, broadcast over all rays and all samples
        num_rays = ray_bundle.origins.shape[0]
        z_vals = z_vals.view(1, -1, 1).expand(num_rays, -1, -1)                 # (num_rays, n_pts_per_ray, 1)
        sample_points = (
            ray_bundle.origins.unsqueeze(1)                                     # (num_rays, 1, 3)
            + z_vals * ray_bundle.directions.unsqueeze(1)                       # (num_rays, n_pts_per_ray, 3)
        )

        # Return
        return ray_bundle._replace(
            sample_points=sample_points,
            sample_lengths=z_vals * torch.ones_like(sample_points[..., :1]),
        )
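The sampler above uses fixed, evenly spaced z values along each ray. True stratified sampling would additionally jitter each sample within its depth bin; a minimal standalone sketch of that variant (not part of the submitted code) is:

import torch

def stratified_z_vals(num_rays, n_pts_per_ray, min_depth, max_depth, device):
    # Jittered stratified sampling: one uniform random offset per bin and per ray.
    bins = torch.linspace(min_depth, max_depth, n_pts_per_ray + 1, device=device)
    lower, upper = bins[:-1], bins[1:]                         # bin edges, shape (n_pts_per_ray,)
    t = torch.rand(num_rays, n_pts_per_ray, device=device)     # jitter in [0, 1)
    return (lower + (upper - lower) * t).unsqueeze(-1)         # (num_rays, n_pts_per_ray, 1)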

Visualization

Code in main.py:

In [ ]:
# TODO (1.4): Implement point sampling along rays in sampler.py
        ray_bundle_modified = model.sampler(ray_bundle)

        # TODO (1.4): Visualize sample points as point cloud
        if cam_idx == 0 and file_prefix == '':
            render_points('render_points.png', ray_bundle_modified.sample_points.view(-1,3)[None],
                          image_size=256, color=[0.7, 0.7, 1], device=device)

Point samples from the first camera:

render_points.png

1.5. Volume rendering (30 points)¶

Implementation
1. VolumeRenderer._compute_weights
2. VolumeRenderer._aggregate
3. Modify the VolumeRenderer.forward method to render a depth map in addition to color from a volume.
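For reference, _compute_weights implements the standard emission-absorption weights referenced in the README: with per-sample densities $\sigma_i$ and segment lengths $\delta_i$,

$$T_i = \exp\Big(-\sum_{j<i} \sigma_j \delta_j\Big), \qquad w_i = T_i \big(1 - \exp(-\sigma_i \delta_i)\big),$$

and _aggregate then renders color and depth as the weighted sums $\sum_i w_i \mathbf{c}_i$ and $\sum_i w_i t_i$.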

In [ ]:
def _compute_weights(
        self,
        deltas,
        rays_density: torch.Tensor,
        eps: float = 1e-10
    ):
        # TODO (1.5): Compute transmittance using the equation described in the README
        num_rays = deltas.shape[0]

        # Per-segment attenuation exp(-sigma_i * delta_i); T_i is the product of all
        # earlier attenuations, with T_0 = 1 (a cumulative product shifted by one).
        factor = torch.exp(-(deltas * rays_density))
        T = torch.cat(
            [
                torch.ones(num_rays, 1, 1, device=get_device()),
                torch.cumprod(factor, dim=1)[:, :-1, :],
            ],
            dim=1,
        )

        # TODO (1.5): Compute weight used for rendering from transmittance and density
        weights = T * (1 - factor)

        return weights
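As a quick standalone sanity check of this recurrence (a hypothetical snippet, not part of the submission), the weights along a ray should sum to $1 - \exp(-\sum_i \sigma_i \delta_i)$:

import torch

# Toy example: one ray, four segments with constant density.
deltas = torch.full((1, 4, 1), 0.5)
sigma = torch.full((1, 4, 1), 2.0)
factor = torch.exp(-deltas * sigma)                                   # per-segment attenuation
T = torch.cat([torch.ones(1, 1, 1), torch.cumprod(factor, dim=1)[:, :-1]], dim=1)
weights = T * (1 - factor)
print(weights.sum().item())   # ~0.982 == 1 - exp(-4 * 0.5 * 2.0)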
In [ ]:
def _aggregate(
        self,
        weights: torch.Tensor,
        rays_feature: torch.Tensor
    ):
        # TODO (1.5): Aggregate (weighted sum of) features using weights
        # pass
        feature = torch.sum(weights * rays_feature, dim=1)

        return feature
In [ ]:
def forward(
        self,
        sampler,
        implicit_fn,
        ray_bundle,
    ):
        B = ray_bundle.shape[0]

        # Process the chunks of rays.
        chunk_outputs = []

        for chunk_start in range(0, B, self._chunk_size):
            cur_ray_bundle = ray_bundle[chunk_start:chunk_start+self._chunk_size]

            # Sample points along the ray
            cur_ray_bundle = sampler(cur_ray_bundle)
            n_pts = cur_ray_bundle.sample_shape[1]

            # Call implicit function with sample points
            implicit_output = implicit_fn(cur_ray_bundle)
            density = implicit_output['density']
            feature = implicit_output['feature']

            # Compute length of each ray segment
            depth_values = cur_ray_bundle.sample_lengths[..., 0]
            deltas = torch.cat(
                (
                    depth_values[..., 1:] - depth_values[..., :-1],
                    1e10 * torch.ones_like(depth_values[..., :1]),
                ),
                dim=-1,
            )[..., None]

            # Compute aggregation weights
            weights = self._compute_weights(
                deltas.view(-1, n_pts, 1),
                density.view(-1, n_pts, 1)
            ) 

            # TODO (1.5): Render (color) features using weights
            feature = self._aggregate(weights, feature.view(-1, n_pts, 3))

            # TODO (1.5): Render depth map
            depth = self._aggregate(weights, depth_values.view(-1, n_pts, 1))

            # Return
            cur_out = {
                'feature': feature,
                'depth': depth,
            }

            chunk_outputs.append(cur_out)

        # Concatenate chunk outputs
        out = {
            k: torch.cat(
              [chunk_out[k] for chunk_out in chunk_outputs],
              dim=0
            ) for k in chunk_outputs[0].keys()
        }

        return out

Visualization

In [ ]:
# TODO (1.5): Implement rendering in renderer.py
out = model(ray_bundle)

# Return rendered features (colors)
image = np.array(out['feature'].view(image_size[1], image_size[0], 3).detach().cpu())
all_images.append(image)

# TODO (1.5): Visualize depth
if cam_idx == 2 and file_prefix == '':
    depth = np.array(out['depth'].view(image_size[1], image_size[0]).detach().cpu())
    plt.imsave(f'{file_prefix}_{cam_idx}_depth.png', depth, cmap='plasma')

part_1.gif

Depth Map:

_2_depth.png

2. Optimizing a basic implicit volume¶

2.1. Random ray sampling (5 points)¶

Implement the get_random_pixels_from_image method in ray_utils.py

In [ ]:
def get_random_pixels_from_image(n_pixels, image_size, camera):
    xy_grid = get_pixels_from_image(image_size, camera)
    
    # TODO (2.1): Random subsampling of pixel coordinates
    x_sampled = torch.rand(n_pixels)*2.0 - 1.0
    y_sampled = torch.rand(n_pixels)*2.0 - 1.0
    xy_grid_sub = torch.stack((x_sampled,y_sampled),dim = 1).to(get_device())

    # Return
    return xy_grid_sub.reshape(-1, 2)[:n_pixels]
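An equivalent approach (a sketch, not the submitted version) would be to subsample the already-computed xy_grid by index instead of drawing fresh NDC coordinates:

# Hypothetical alternative: index-based subsampling of the full pixel grid.
idx = torch.randperm(xy_grid.shape[0])[:n_pixels]
xy_grid_sub = xy_grid[idx].to(get_device())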

2.2. Loss and training (5 points)¶

Replace the loss in train with mean squared error between the predicted colors and ground truth colors rgb_gt

In [ ]:
# TODO (2.2): Calculate loss

predicted = out['feature']
mse_loss = torch.nn.MSELoss()
loss = mse_loss(predicted, rgb_gt)

Report the center of the box and the side lengths of the box after training, rounded to the nearest 1/100.

Box center: [0.25, 0.25, 0.00].
Box side lengths: [2.00, 1.50, 1.50]

Visualization

Render of a spiral sequence of the optimized volume:

part_2.gif

3. Optimizing a Neural Radiance Field (NeRF) (30 points)¶

Implementation
1. An implicit volume as a Multi-Layer Perceptron (MLP) in the NeuralRadianceField class in implicit.py.
2. The loss in train_nerf in main.py.
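For reference, the harmonic embedding is a positional encoding in the spirit of NeRF: each input coordinate p is mapped to sines and cosines at geometrically increasing frequencies,

$$\gamma(p) = \big(\sin(2^0 p), \cos(2^0 p), \ldots, \sin(2^{L-1} p), \cos(2^{L-1} p)\big),$$

where L is n_harmonic_functions; the exact frequency scaling and whether the raw input is appended depend on the HarmonicEmbedding implementation in the starter code.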

In [ ]:
# TODO (3.1): Implement NeRF MLP
class NeuralRadianceField(torch.nn.Module):
    def __init__(
        self,
        cfg,
    ):
        super().__init__()

        self.harmonic_embedding_xyz = HarmonicEmbedding(3, cfg.n_harmonic_functions_xyz)
        self.harmonic_embedding_dir = HarmonicEmbedding(3, cfg.n_harmonic_functions_dir)

        embedding_dim_xyz = self.harmonic_embedding_xyz.output_dim
        embedding_dim_dir = self.harmonic_embedding_dir.output_dim

        self.cfg = cfg
       
        embedding_dim = embedding_dim_xyz                      # 39
        hidden_dim = cfg.n_hidden_neurons_xyz                  # 128

        self.in_layer = torch.nn.Linear(embedding_dim, hidden_dim)
        self.hidden = torch.nn.Linear(hidden_dim, hidden_dim)
        
        # separate output heads for density and color
        self.out_density = torch.nn.Linear(hidden_dim, 1)
        self.out_color = torch.nn.Linear(hidden_dim, 3)

        self.relu = torch.nn.functional.relu
        self.sigmoid = torch.sigmoid

    def forward(self, ray_bundle):
        sample_points = ray_bundle.sample_points.view(-1, 3)
        embedded_xyz = self.harmonic_embedding_xyz(sample_points)
        
        x = embedded_xyz                                   # (N, embedding_dim_xyz)

        x = self.in_layer(x)                               # (N, hidden_dim)

        # Note: the same hidden layer is applied repeatedly (shared weights across depth)
        for _ in range(self.cfg.n_layers_xyz - 2):
            x = self.relu(self.hidden(x))

        # Density must be non-negative (ReLU); color lies in [0, 1] (sigmoid)
        density = self.relu(self.out_density(x))
        color = self.sigmoid(self.out_color(x))

        out = {
            'density': density,
            'feature': color,
        }

        return out
In [ ]:
# TODO (3.1): Calculate loss

predicted = out['feature']
mse_loss = torch.nn.MSELoss()
loss = mse_loss(predicted, rgb_gt)

Visualization

part_3.gif

4. NeRF Extras (Choose at least one! More than one is extra credit)¶

4.1 View Dependence (10 pts)¶

Code of the network:

In [ ]:
class NeuralRadianceField(torch.nn.Module):
    def __init__(
        self,
        cfg,
    ):
        super().__init__()

        self.harmonic_embedding_xyz = HarmonicEmbedding(3, cfg.n_harmonic_functions_xyz)
        self.harmonic_embedding_dir = HarmonicEmbedding(3, cfg.n_harmonic_functions_dir)

        embedding_dim_xyz = self.harmonic_embedding_xyz.output_dim
        embedding_dim_dir = self.harmonic_embedding_dir.output_dim

        self.cfg = cfg

        embedding_dim = embedding_dim_dir + embedding_dim_xyz       # 54
        hidden_dim = cfg.n_hidden_neurons_xyz                       # 128

        self.in_layer = torch.nn.Linear(embedding_dim, hidden_dim)
        self.hidden = torch.nn.Linear(hidden_dim, hidden_dim)
        
        self.out_density = torch.nn.Linear(hidden_dim, 1)
        self.out_color = torch.nn.Linear(hidden_dim, 3)

        self.relu = torch.nn.functional.relu
        self.sigmoid = torch.sigmoid

    def forward(self, ray_bundle):
        sample_points = ray_bundle.sample_points.view(-1, 3)
        embedded_xyz = self.harmonic_embedding_xyz(sample_points)
        
        directions = ray_bundle.directions.view(-1, 3)             # 4.1
        embedded_dir = self.harmonic_embedding_dir(directions)     # 4.1: (n_rays, embedding_dim_dir)

        # Each ray direction is shared by every sample along that ray, so repeat the
        # direction embedding once per sample before concatenating with the xyz embedding.
        n_pts = ray_bundle.sample_points.shape[1]
        dir_dim = embedded_dir.shape[-1]
        embedded_dir = embedded_dir.unsqueeze(1).repeat(1, n_pts, 1).reshape(-1, dir_dim)
        x = torch.cat((embedded_xyz, embedded_dir), dim=-1)        # 4.1

        x = self.in_layer(x)                                       # (N, hidden_dim)

        for _ in range(self.cfg.n_layers_xyz-2):
            x = self.relu(self.hidden(x))

        density = self.relu(self.out_density(x))
        color = self.sigmoid(self.out_color(x))

        out = {
            'density': density,
            'feature': color,
        }

        return out

Visualization

part_4_1.gif

With added view dependence (the ray directions), the results improved slightly, both visually and in terms of loss, under the same training parameters and number of epochs. View dependence can lead to overfitting to particular views in the training data. However, for a volumetric rendering task the predicted density should not be view-dependent, so we can use separate prediction heads for density and color: the density head takes only the point position as input, while the color head takes both the position and the viewing direction.
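A minimal sketch of that separation (hypothetical, not the submitted network; layer sizes are placeholders): density is predicted from the position features alone, and the direction embedding is only concatenated for the color head.

import torch

class ViewDependentHeads(torch.nn.Module):
    # Hypothetical module illustrating the density/color separation described above.
    def __init__(self, xyz_dim, dir_dim, hidden_dim=128):
        super().__init__()
        self.trunk = torch.nn.Sequential(
            torch.nn.Linear(xyz_dim, hidden_dim), torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim), torch.nn.ReLU(),
        )
        self.out_density = torch.nn.Linear(hidden_dim, 1)             # position only
        self.color_head = torch.nn.Sequential(                        # position + view direction
            torch.nn.Linear(hidden_dim + dir_dim, hidden_dim // 2), torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim // 2, 3), torch.nn.Sigmoid(),
        )

    def forward(self, embedded_xyz, embedded_dir):
        feat = self.trunk(embedded_xyz)
        density = torch.relu(self.out_density(feat))                  # view-independent density
        color = self.color_head(torch.cat([feat, embedded_dir], dim=-1))
        return {'density': density, 'feature': color}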