Voice profiling aims at inferring various human parameters from their speech,
e.g. gender, age, etc. In this paper, we address the challenge posed by a
subtask of voice profiling - reconstructing someone's face from their voice.
The task is designed to answer the question: given an audio clip spoken by an
unseen person, can we picture a face that has as many common elements, or
associations as possible with the speaker, in terms of identity? To address
this problem, we propose a simple but effective computational framework based
on generative adversarial networks (GANs). The network learns to generate faces
from voices by matching the identities of generated faces to those of the
speakers, on a training set. We evaluate the performance of the network by
leveraging a closely related task - cross-modal matching. The results show that
our model is able to generate faces that match several biometric
characteristics of the speaker, and results in matching accuracies that are
much better than chance.
%0 Generic
%1 wen2019reconstructing
%A Wen, Yandong
%A Singh, Rita
%A Raj, Bhiksha
%D 2019
%K gan
%T Reconstructing faces from voices
%U http://arxiv.org/abs/1905.10604
%X Voice profiling aims at inferring various human parameters from their speech,
e.g. gender, age, etc. In this paper, we address the challenge posed by a
subtask of voice profiling - reconstructing someone's face from their voice.
The task is designed to answer the question: given an audio clip spoken by an
unseen person, can we picture a face that has as many common elements, or
associations as possible with the speaker, in terms of identity? To address
this problem, we propose a simple but effective computational framework based
on generative adversarial networks (GANs). The network learns to generate faces
from voices by matching the identities of generated faces to those of the
speakers, on a training set. We evaluate the performance of the network by
leveraging a closely related task - cross-modal matching. The results show that
our model is able to generate faces that match several biometric
characteristics of the speaker, and results in matching accuracies that are
much better than chance.
@misc{wen2019reconstructing,
  abstract      = {Voice profiling aims at inferring various human parameters from their speech,
e.g. gender, age, etc. In this paper, we address the challenge posed by a
subtask of voice profiling - reconstructing someone's face from their voice.
The task is designed to answer the question: given an audio clip spoken by an
unseen person, can we picture a face that has as many common elements, or
associations as possible with the speaker, in terms of identity? To address
this problem, we propose a simple but effective computational framework based
on generative adversarial networks (GANs). The network learns to generate faces
from voices by matching the identities of generated faces to those of the
speakers, on a training set. We evaluate the performance of the network by
leveraging a closely related task - cross-modal matching. The results show that
our model is able to generate faces that match several biometric
characteristics of the speaker, and results in matching accuracies that are
much better than chance.},
  added-at      = {2020-11-12T07:25:48.000+0100},
  archiveprefix = {arXiv},
  author        = {Wen, Yandong and Singh, Rita and Raj, Bhiksha},
  biburl        = {https://www.bibsonomy.org/bibtex/2900d683ff088e4a334678b6215411b6b/jgeofil},
  description   = {Reconstructing faces from voices},
  eprint        = {1905.10604},
  interhash     = {80869f95b49c7e38c84830c54e2fe966},
  intrahash     = {900d683ff088e4a334678b6215411b6b},
  keywords      = {gan},
  timestamp     = {2020-11-12T07:25:48.000+0100},
  title         = {Reconstructing Faces from Voices},
  url           = {http://arxiv.org/abs/1905.10604},
  year          = {2019},
}