Per-pixel ground-truth depth data is challenging to acquire at scale. To overcome this limitation, self-supervised learning has emerged as a promising alternative for training models to perform monocular depth estimation. In this paper, we propose a set of improvements, which together result in both quantitatively and qualitatively improved depth maps compared to competing self-supervised methods. Research on self-supervised monocular training usually explores increasingly complex architectures, loss functions, and image formation models, all of which have recently helped to close the gap with fully-supervised methods. We show that a surprisingly simple model, and associated design choices, lead to superior predictions. In particular, we propose (i) a minimum reprojection loss, designed to robustly handle occlusions, (ii) a full-resolution multi-scale sampling method that reduces visual artifacts, and (iii) an auto-masking loss to ignore training pixels that violate camera motion assumptions. We demonstrate the effectiveness of each component in isolation, and show high quality, state-of-the-art results on the KITTI benchmark.
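The abstract names two loss-level ideas, the per-pixel minimum reprojection loss (i) and the auto-masking loss (iii). Below is a minimal PyTorch-style sketch of how such a combined loss could be assembled. It is an illustration under stated assumptions, not the authors' released implementation (their code is published as monodepth2): the helper photometric_error is a stand-in for the SSIM + L1 term used in practice, and all function and argument names here are hypothetical.

import torch

def photometric_error(pred, target):
    # Stand-in photometric error: mean absolute difference over channels.
    # The paper additionally uses an SSIM term; omitted to keep the sketch short.
    return torch.abs(pred - target).mean(1, keepdim=True)  # [B, 1, H, W]

def min_reprojection_loss(target, warped_sources, raw_sources):
    # target:         [B, 3, H, W] current frame
    # warped_sources: list of [B, 3, H, W] source frames warped into the target view
    # raw_sources:    list of [B, 3, H, W] un-warped source frames (identity reprojection)

    # (i) Minimum reprojection: take the per-pixel minimum error across source
    # frames instead of the average, which is more robust to occlusions.
    reproj = torch.cat([photometric_error(w, target) for w in warped_sources], dim=1)

    # (iii) Auto-masking: also compute the error of the un-warped sources. Pixels
    # whose un-warped error is already lower (static scenes, objects moving with
    # the camera) are effectively ignored once the minimum is taken.
    identity = torch.cat([photometric_error(s, target) for s in raw_sources], dim=1)
    identity = identity + torch.randn_like(identity) * 1e-5  # break ties

    combined = torch.cat([reproj, identity], dim=1)
    per_pixel_min, _ = combined.min(dim=1)
    return per_pixel_min.mean()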
%0 Conference Paper
%1 2019-godard
%A Godard, Clement
%A Mac Aodha, Oisin
%A Firman, Michael
%A Brostow, Gabriel
%B 2019 IEEE/CVF International Conference on Computer Vision (ICCV)
%D 2019
%K depth monocular monodepth2 niantic self-supervised
%P 3827-3837
%R 10.1109/ICCV.2019.00393
%T Digging Into Self-Supervised Monocular Depth Estimation
%U https://ieeexplore.ieee.org/document/9009796/
%X Per-pixel ground-truth depth data is challenging to acquire at scale. To overcome this limitation, self-supervised learning has emerged as a promising alternative for training models to perform monocular depth estimation. In this paper, we propose a set of improvements, which together result in both quantitatively and qualitatively improved depth maps compared to competing self-supervised methods. Research on self-supervised monocular training usually explores increasingly complex architectures, loss functions, and image formation models, all of which have recently helped to close the gap with fully-supervised methods. We show that a surprisingly simple model, and associated design choices, lead to superior predictions. In particular, we propose (i) a minimum reprojection loss, designed to robustly handle occlusions, (ii) a full-resolution multi-scale sampling method that reduces visual artifacts, and (iii) an auto-masking loss to ignore training pixels that violate camera motion assumptions. We demonstrate the effectiveness of each component in isolation, and show high quality, state-of-the-art results on the KITTI benchmark.
@inproceedings{2019-godard,
abstract = {Per-pixel ground-truth depth data is challenging to acquire at scale. To overcome this limitation, self-supervised learning has emerged as a promising alternative for training models to perform monocular depth estimation. In this paper, we propose a set of improvements, which together result in both quantitatively and qualitatively improved depth maps compared to competing self-supervised methods. Research on self-supervised monocular training usually explores increasingly complex architectures, loss functions, and image formation models, all of which have recently helped to close the gap with fully-supervised methods. We show that a surprisingly simple model, and associated design choices, lead to superior predictions. In particular, we propose (i) a minimum reprojection loss, designed to robustly handle occlusions, (ii) a full-resolution multi-scale sampling method that reduces visual artifacts, and (iii) an auto-masking loss to ignore training pixels that violate camera motion assumptions. We demonstrate the effectiveness of each component in isolation, and show high quality, state-of-the-art results on the KITTI benchmark.},
author = {Godard, Clement and Mac Aodha, Oisin and Firman, Michael and Brostow, Gabriel},
booktitle = {2019 IEEE/CVF International Conference on Computer Vision (ICCV)},
doi = {10.1109/ICCV.2019.00393},
issn = {2380-7504},
keywords = {depth monocular monodepth2 niantic self-supervised},
month = oct,
pages = {3827--3837},
title = {Digging Into Self-Supervised Monocular Depth Estimation},
url = {https://ieeexplore.ieee.org/document/9009796/},
year = 2019
}