Learning general image representations has proven key to the success of many
computer vision tasks. For example, many approaches to image understanding
problems rely on deep networks that were initially trained on ImageNet, mostly
because the learned features are a valuable starting point to learn from
limited labeled data. However, when it comes to 3D motion capture of multiple
people, these features are only of limited use.
In this paper, we therefore propose an approach to learning features that are
useful for this purpose. To this end, we introduce a self-supervised approach
to learning what we call a neural scene decomposition (NSD) that can be
exploited for 3D pose estimation. NSD comprises three layers of abstraction to
represent human subjects: spatial layout in terms of bounding-boxes and
relative depth; a 2D shape representation in terms of an instance segmentation
mask; and subject-specific appearance and 3D pose information. By exploiting
self-supervision coming from multiview data, our NSD model can be trained
end-to-end without any 2D or 3D supervision. In contrast to previous
approaches, it works for multiple persons and full-frame images. Because it
encodes 3D geometry, NSD can then be effectively leveraged to train a 3D pose
estimation network from small amounts of annotated data.
Description
[1903.05684] Neural Scene Decomposition for Multi-Person Motion Capture
%0 Generic
%1 rhodin2019neural
%A Rhodin, Helge
%A Constantin, Victor
%A Katircioglu, Isinsu
%A Salzmann, Mathieu
%A Fua, Pascal
%D 2019
%K 2019 motion-capture
%T Neural Scene Decomposition for Multi-Person Motion Capture
%U http://arxiv.org/abs/1903.05684
%X Learning general image representations has proven key to the success of many
computer vision tasks. For example, many approaches to image understanding
problems rely on deep networks that were initially trained on ImageNet, mostly
because the learned features are a valuable starting point to learn from
limited labeled data. However, when it comes to 3D motion capture of multiple
people, these features are only of limited use.
In this paper, we therefore propose an approach to learning features that are
useful for this purpose. To this end, we introduce a self-supervised approach
to learning what we call a neural scene decomposition (NSD) that can be
exploited for 3D pose estimation. NSD comprises three layers of abstraction to
represent human subjects: spatial layout in terms of bounding-boxes and
relative depth; a 2D shape representation in terms of an instance segmentation
mask; and subject-specific appearance and 3D pose information. By exploiting
self-supervision coming from multiview data, our NSD model can be trained
end-to-end without any 2D or 3D supervision. In contrast to previous
approaches, it works for multiple persons and full-frame images. Because it
encodes 3D geometry, NSD can then be effectively leveraged to train a 3D pose
estimation network from small amounts of annotated data.
@comment{BibSonomy export, cleaned: arXiv id moved from the garbled note
field into proper eprint fields; note reduced to the venue comment.}
@misc{rhodin2019neural,
  abstract      = {Learning general image representations has proven key to the success of many
computer vision tasks. For example, many approaches to image understanding
problems rely on deep networks that were initially trained on ImageNet, mostly
because the learned features are a valuable starting point to learn from
limited labeled data. However, when it comes to 3D motion capture of multiple
people, these features are only of limited use.
In this paper, we therefore propose an approach to learning features that are
useful for this purpose. To this end, we introduce a self-supervised approach
to learning what we call a neural scene decomposition (NSD) that can be
exploited for 3D pose estimation. NSD comprises three layers of abstraction to
represent human subjects: spatial layout in terms of bounding-boxes and
relative depth; a 2D shape representation in terms of an instance segmentation
mask; and subject-specific appearance and 3D pose information. By exploiting
self-supervision coming from multiview data, our NSD model can be trained
end-to-end without any 2D or 3D supervision. In contrast to previous
approaches, it works for multiple persons and full-frame images. Because it
encodes 3D geometry, NSD can then be effectively leveraged to train a 3D pose
estimation network from small amounts of annotated data.},
  added-at      = {2019-09-11T17:04:35.000+0200},
  author        = {Rhodin, Helge and Constantin, Victor and Katircioglu, Isinsu and Salzmann, Mathieu and Fua, Pascal},
  biburl        = {https://www.bibsonomy.org/bibtex/28dc5f572bf0a16df4ba87a2f8650425e/analyst},
  description   = {[1903.05684] Neural Scene Decomposition for Multi-Person Motion Capture},
  eprint        = {1903.05684},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CV},
  interhash     = {3d5145e78aa7dbccc756f8c9359a9dda},
  intrahash     = {8dc5f572bf0a16df4ba87a2f8650425e},
  keywords      = {2019 motion-capture},
  note          = {CVPR 2019},
  timestamp     = {2019-09-11T17:04:35.000+0200},
  title         = {Neural Scene Decomposition for Multi-Person Motion Capture},
  url           = {http://arxiv.org/abs/1903.05684},
  year          = {2019},
}