We present an end-to-end 3D reconstruction method for a scene by directly
regressing a truncated signed distance function (TSDF) from a set of posed RGB
images. Traditional approaches to 3D reconstruction rely on an intermediate
representation of depth maps prior to estimating a full 3D model of a scene. We
hypothesize that a direct regression to 3D is more effective. A 2D CNN extracts
features from each image independently which are then back-projected and
accumulated into a voxel volume using the camera intrinsics and extrinsics.
After accumulation, a 3D CNN refines the accumulated features and predicts the
TSDF values. Additionally, semantic segmentation of the 3D model is obtained
without significant computation. This approach is evaluated on the ScanNet
dataset where we significantly outperform state-of-the-art baselines (deep
multiview stereo followed by traditional TSDF fusion) both quantitatively and
qualitatively. We compare our 3D semantic segmentation to prior methods that
use a depth sensor since no previous work attempts the problem with only RGB
input.
%0 Generic
%1 murez2020atlas
%A Murez, Zak
%A van As, Tarrence
%A Bartolozzi, James
%A Sinha, Ayan
%A Badrinarayanan, Vijay
%A Rabinovich, Andrew
%D 2020
%K 3d_reconstruction deeplearning eccv20 neural_reconstruction
%T Atlas: End-to-End 3D Scene Reconstruction from Posed Images
%U http://arxiv.org/abs/2003.10432
%X We present an end-to-end 3D reconstruction method for a scene by directly
regressing a truncated signed distance function (TSDF) from a set of posed RGB
images. Traditional approaches to 3D reconstruction rely on an intermediate
representation of depth maps prior to estimating a full 3D model of a scene. We
hypothesize that a direct regression to 3D is more effective. A 2D CNN extracts
features from each image independently which are then back-projected and
accumulated into a voxel volume using the camera intrinsics and extrinsics.
After accumulation, a 3D CNN refines the accumulated features and predicts the
TSDF values. Additionally, semantic segmentation of the 3D model is obtained
without significant computation. This approach is evaluated on the ScanNet
dataset where we significantly outperform state-of-the-art baselines (deep
multiview stereo followed by traditional TSDF fusion) both quantitatively and
qualitatively. We compare our 3D semantic segmentation to prior methods that
use a depth sensor since no previous work attempts the problem with only RGB
input.
@misc{murez2020atlas,
  author        = {Murez, Zak and van As, Tarrence and Bartolozzi, James and Sinha, Ayan and Badrinarayanan, Vijay and Rabinovich, Andrew},
  title         = {{Atlas}: End-to-End {3D} Scene Reconstruction from Posed Images},
  year          = {2020},
  eprint        = {2003.10432},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/2003.10432},
  abstract      = {We present an end-to-end 3D reconstruction method for a scene by directly
regressing a truncated signed distance function (TSDF) from a set of posed RGB
images. Traditional approaches to 3D reconstruction rely on an intermediate
representation of depth maps prior to estimating a full 3D model of a scene. We
hypothesize that a direct regression to 3D is more effective. A 2D CNN extracts
features from each image independently which are then back-projected and
accumulated into a voxel volume using the camera intrinsics and extrinsics.
After accumulation, a 3D CNN refines the accumulated features and predicts the
TSDF values. Additionally, semantic segmentation of the 3D model is obtained
without significant computation. This approach is evaluated on the ScanNet
dataset where we significantly outperform state-of-the-art baselines (deep
multiview stereo followed by traditional TSDF fusion) both quantitatively and
qualitatively. We compare our 3D semantic segmentation to prior methods that
use a depth sensor since no previous work attempts the problem with only RGB
input.},
  keywords      = {3d_reconstruction deeplearning eccv20 neural_reconstruction},
  description   = {2003.10432.pdf},
  added-at      = {2021-06-26T11:04:17.000+0200},
  timestamp     = {2021-06-26T11:04:17.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2d0e0082f24e02e4a5cbfc1153c0e1442/shuncheng.wu},
  interhash     = {cf35e36fa771e06e8f50ae7e00f34318},
  intrahash     = {d0e0082f24e02e4a5cbfc1153c0e1442},
}