We propose a Text-to-Speech method to create an unseen expressive style using
one utterance of expressive speech of around one second. Specifically, we
enhance the disentanglement capabilities of a state-of-the-art
sequence-to-sequence based system with a Variational AutoEncoder (VAE) and a
Householder Flow. The proposed system provides a 22% KL-divergence reduction
while jointly improving perceptual metrics over state-of-the-art. At synthesis
time we use one example of expressive style as a reference input to the encoder
for generating any text in the desired style. Perceptual MUSHRA evaluations
show that we can create a voice with a 9% relative naturalness improvement over
standard Neural Text-to-Speech, while also improving the perceived emotional
intensity (59 compared to the 55 of neutral speech).
Description
Using VAEs and Normalizing Flows for One-shot Text-To-Speech Synthesis of Expressive Speech
%0 Generic
%1 aggarwal2019using
%A Aggarwal, Vatsal
%A Cotescu, Marius
%A Prateek, Nishant
%A Lorenzo-Trueba, Jaime
%A Barra-Chicote, Roberto
%D 2019
%K disentanglement emotions myown tts
%T Using VAEs and Normalizing Flows for One-shot Text-To-Speech Synthesis
of Expressive Speech
%U http://arxiv.org/abs/1911.12760
%X We propose a Text-to-Speech method to create an unseen expressive style using
one utterance of expressive speech of around one second. Specifically, we
enhance the disentanglement capabilities of a state-of-the-art
sequence-to-sequence based system with a Variational AutoEncoder (VAE) and a
Householder Flow. The proposed system provides a 22% KL-divergence reduction
while jointly improving perceptual metrics over state-of-the-art. At synthesis
time we use one example of expressive style as a reference input to the encoder
for generating any text in the desired style. Perceptual MUSHRA evaluations
show that we can create a voice with a 9% relative naturalness improvement over
standard Neural Text-to-Speech, while also improving the perceived emotional
intensity (59 compared to the 55 of neutral speech).
@misc{aggarwal2019using,
  abstract      = {We propose a Text-to-Speech method to create an unseen expressive style using
one utterance of expressive speech of around one second. Specifically, we
enhance the disentanglement capabilities of a state-of-the-art
sequence-to-sequence based system with a Variational AutoEncoder (VAE) and a
Householder Flow. The proposed system provides a 22\% KL-divergence reduction
while jointly improving perceptual metrics over state-of-the-art. At synthesis
time we use one example of expressive style as a reference input to the encoder
for generating any text in the desired style. Perceptual MUSHRA evaluations
show that we can create a voice with a 9\% relative naturalness improvement over
standard Neural Text-to-Speech, while also improving the perceived emotional
intensity (59 compared to the 55 of neutral speech).},
  added-at      = {2020-11-19T14:24:38.000+0100},
  archiveprefix = {arXiv},
  author        = {Aggarwal, Vatsal and Cotescu, Marius and Prateek, Nishant and Lorenzo-Trueba, Jaime and Barra-Chicote, Roberto},
  biburl        = {https://www.bibsonomy.org/bibtex/2e6cc13458983677829f08f992415a6ff/marius.cotescu},
  description   = {Using VAEs and Normalizing Flows for One-shot Text-To-Speech Synthesis of Expressive Speech},
  eprint        = {1911.12760},
  interhash     = {9a46dea1ea609ec0d3bcf1ab60cd9c03},
  intrahash     = {e6cc13458983677829f08f992415a6ff},
  keywords      = {disentanglement emotions myown tts},
  note          = {Accepted to ICASSP 2020},
  timestamp     = {2020-11-19T14:24:38.000+0100},
  title         = {Using {VAEs} and Normalizing Flows for One-shot {Text-To-Speech} Synthesis of Expressive Speech},
  url           = {http://arxiv.org/abs/1911.12760},
  year          = {2019},
}