Our goal is to learn a deep network that, given a small number of images of
an object of a given category, reconstructs it in 3D. While several recent
works have obtained analogous results using synthetic data or by assuming the
availability of 2D primitives such as keypoints, we are interested in working
with challenging real data and no manual annotations. We thus focus on
learning a model from multiple views of a large collection of object instances.
We contribute a new large dataset of object-centric videos suitable for
training and benchmarking this class of models. We show that existing
techniques leveraging meshes, voxels, or implicit surfaces, which work well for
reconstructing isolated objects, fail on this challenging data. Finally, we
propose a new neural network design, called warp-conditioned ray embedding
(WCR), which significantly improves reconstruction while obtaining a detailed
implicit representation of the object's surface and texture, and which also
compensates for noise in the initial structure-from-motion (SfM) reconstruction
that bootstraps the learning process. Our evaluation demonstrates performance
improvements over several deep monocular reconstruction baselines, both on
existing benchmarks and on our new dataset.
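
The abstract only names the warp-conditioned ray embedding (WCR); its mechanics
are left to the paper. As a rough illustration of the general idea of
conditioning points on a rendering ray on source views, below is a minimal
PyTorch sketch that warps 3D sample points into each source camera, bilinearly
samples a CNN feature map at the projected location, and mean-pools across
views. Every name, tensor shape, and the mean-pooling choice here is an
illustrative assumption, not the paper's actual architecture.

import torch
import torch.nn.functional as F

def warp_conditioned_ray_embedding(points, K, w2c, feats):
    # points: (P, 3) 3D sample points along target rays, world coordinates.
    # K:      (V, 3, 3) source-view camera intrinsics.
    # w2c:    (V, 3, 4) source-view world-to-camera extrinsics [R|t].
    # feats:  (V, C, H, W) CNN feature maps of the V source views.
    # Returns (P, C) per-point embeddings, mean-pooled over source views.
    V, C, H, W = feats.shape
    P = points.shape[0]
    # Homogeneous world points, warped into each source camera frame.
    pts_h = torch.cat([points, points.new_ones(P, 1)], dim=-1)   # (P, 4)
    cam = torch.einsum('vij,pj->vpi', w2c, pts_h)                # (V, P, 3)
    # Perspective projection to pixel coordinates.
    uvw = torch.einsum('vij,vpj->vpi', K, cam)                   # (V, P, 3)
    uv = uvw[..., :2] / uvw[..., 2:3].clamp(min=1e-6)            # (V, P, 2)
    # Normalize to [-1, 1] and bilinearly sample the feature maps.
    grid = torch.stack([2.0 * uv[..., 0] / (W - 1) - 1.0,
                        2.0 * uv[..., 1] / (H - 1) - 1.0], dim=-1)
    sampled = F.grid_sample(feats, grid.unsqueeze(1),            # (V, C, 1, P)
                            mode='bilinear', padding_mode='zeros',
                            align_corners=True)
    return sampled.squeeze(2).mean(dim=0).transpose(0, 1)        # (P, C)

In a pixelNeRF-style pipeline, such a pooled embedding would typically be
concatenated with a positional encoding of the point and fed to an implicit
MLP predicting occupancy/density and color; whether WCR aggregates views this
way is likewise an assumption of this sketch.
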
@misc{henzler2021unsupervised,
author = {Henzler, Philipp and Reizenstein, Jeremy and Labatut, Patrick and Shapovalov, Roman and Ritschel, Tobias and Vedaldi, Andrea and Novotny, David},
title = {Unsupervised Learning of 3D Object Categories from Videos in the Wild},
url = {http://arxiv.org/abs/2103.16552},
year = 2021
}