Attention-based Transformer models have been increasingly employed for automatic music generation. To condition the generation process of such a model with a user-specified sequence, a popular approach is to take that conditioning sequence as a priming sequence and ask a Transformer decoder to generate a continuation. However, this prompt-based conditioning cannot guarantee that the conditioning sequence would develop or even simply repeat itself in the generated continuation. In this paper, we propose an alternative conditioning approach, called theme-based conditioning, that explicitly trains the Transformer to treat the conditioning sequence as a thematic material that has to manifest itself multiple times in its generation result. This is achieved with two main technical contributions. First, we propose a deep learning-based approach that uses contrastive representation learning and clustering to automatically retrieve thematic materials from music pieces in the training data. Second, we propose a novel gated parallel attention module to be used in a sequence-to-sequence (seq2seq) encoder/decoder architecture to more effectively account for a given conditioning thematic material in the generation process of the Transformer decoder. We report on objective and subjective evaluations of variants of the proposed Theme Transformer and the conventional prompt-based baseline, showing that our best model can generate, to some extent, polyphonic pop piano music with repetition and plausible variations of a given condition.
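To make the gated parallel attention idea from the abstract more concrete, here is a minimal, hypothetical PyTorch sketch of a decoder layer that runs causal self-attention and cross-attention to the encoded theme in parallel, then blends the two streams with a learned sigmoid gate. All class names, dimensions, and the exact gating formulation are illustrative assumptions, not the authors' published implementation; consult the paper for the actual design.

```python
import torch
import torch.nn as nn

class GatedParallelAttention(nn.Module):
    """Sketch of a gated parallel attention decoder layer (assumed design):
    self-attention over generated tokens and cross-attention to the theme
    memory run in parallel, and a learned gate mixes the two results."""

    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.gate = nn.Linear(2 * d_model, d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, theme_memory, causal_mask=None):
        # Causal self-attention over the tokens generated so far.
        h_self, _ = self.self_attn(x, x, x, attn_mask=causal_mask)
        # Cross-attention to the encoder output for the conditioning theme.
        h_theme, _ = self.cross_attn(x, theme_memory, theme_memory)
        # Per-position, per-channel gate deciding how much theme information
        # to blend into the self-attention stream.
        g = torch.sigmoid(self.gate(torch.cat([h_self, h_theme], dim=-1)))
        return self.norm(x + g * h_theme + (1.0 - g) * h_self)

# Toy usage (batch=1, 16 decoder steps, 8 theme tokens), purely illustrative.
layer = GatedParallelAttention(d_model=256, n_heads=4)
x = torch.randn(1, 16, 256)       # embeddings of the partially generated sequence
theme = torch.randn(1, 8, 256)    # encoder output for the theme condition
mask = nn.Transformer.generate_square_subsequent_mask(16)
out = layer(x, theme, causal_mask=mask)  # -> shape (1, 16, 256)
```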
%0 Journal Article
%1 9740506
%A Shih, Yi-Jen
%A Wu, Shih-Lun
%A Zalkow, Frank
%A Müller, Meinard
%A Yang, Yi-Hsuan
%D 2022
%J IEEE Transactions on Multimedia
%K generation theme-based
%P 1-1
%R 10.1109/TMM.2022.3161851
%T Theme Transformer: Symbolic Music Generation with Theme-Conditioned Transformer
%U https://ieeexplore.ieee.org/document/9740506
%X Attention-based Transformer models have been increasingly employed for automatic music generation. To condition the generation process of such a model with a user-specified sequence, a popular approach is to take that conditioning sequence as a priming sequence and ask a Transformer decoder to generate a continuation. However, this prompt-based conditioning cannot guarantee that the conditioning sequence would develop or even simply repeat itself in the generated continuation. In this paper, we propose an alternative conditioning approach, called theme-based conditioning, that explicitly trains the Transformer to treat the conditioning sequence as a thematic material that has to manifest itself multiple times in its generation result. This is achieved with two main technical contributions. First, we propose a deep learning-based approach that uses contrastive representation learning and clustering to automatically retrieve thematic materials from music pieces in the training data. Second, we propose a novel gated parallel attention module to be used in a sequence-to-sequence (seq2seq) encoder/decoder architecture to more effectively account for a given conditioning thematic material in the generation process of the Transformer decoder. We report on objective and subjective evaluations of variants of the proposed Theme Transformer and the conventional prompt-based baseline, showing that our best model can generate, to some extent, polyphonic pop piano music with repetition and plausible variations of a given condition.
@article{9740506,
abstract = {Attention-based Transformer models have been increasingly employed for automatic music generation. To condition the generation process of such a model with a user-specified sequence, a popular approach is to take that conditioning sequence as a priming sequence and ask a Transformer decoder to generate a continuation. However, this prompt-based conditioning cannot guarantee that the conditioning sequence would develop or even simply repeat itself in the generated continuation. In this paper, we propose an alternative conditioning approach, called theme-based conditioning, that explicitly trains the Transformer to treat the conditioning sequence as a thematic material that has to manifest itself multiple times in its generation result. This is achieved with two main technical contributions. First, we propose a deep learning-based approach that uses contrastive representation learning and clustering to automatically retrieve thematic materials from music pieces in the training data. Second, we propose a novel gated parallel attention module to be used in a sequence-to-sequence (seq2seq) encoder/decoder architecture to more effectively account for a given conditioning thematic material in the generation process of the Transformer decoder. We report on objective and subjective evaluations of variants of the proposed Theme Transformer and the conventional prompt-based baseline, showing that our best model can generate, to some extent, polyphonic pop piano music with repetition and plausible variations of a given condition.},
added-at = {2023-04-17T14:59:30.000+0200},
author = {Shih, Yi-Jen and Wu, Shih-Lun and Zalkow, Frank and M{\"u}ller, Meinard and Yang, Yi-Hsuan},
biburl = {https://www.bibsonomy.org/bibtex/2985e91854767679de59ef77f6630cec5/alex_h},
description = {Theme Transformer: Symbolic Music Generation with Theme-Conditioned Transformer},
doi = {10.1109/TMM.2022.3161851},
interhash = {8cd85857711e6288e13dd6099a6accd7},
intrahash = {985e91854767679de59ef77f6630cec5},
issn = {1941-0077},
journal = {IEEE Transactions on Multimedia},
keywords = {generation theme-based},
pages = {1-1},
timestamp = {2023-04-17T14:59:30.000+0200},
title = {Theme Transformer: Symbolic Music Generation with Theme-Conditioned Transformer},
url = {https://ieeexplore.ieee.org/document/9740506},
year = 2022
}