We advance the state of the art in polyphonic piano music transcription by
using a deep convolutional and recurrent neural network which is trained to
jointly predict onsets and frames. Our model predicts pitch onset events and
then uses those predictions to condition framewise pitch predictions. During
inference, we restrict the predictions from the framewise detector by not
allowing a new note to start unless the onset detector also agrees that an
onset for that pitch is present in the frame. We focus on improving onsets and
offsets together instead of either in isolation as we believe this correlates
better with human musical perception. Our approach results in over a 100%
relative improvement in note F1 score (with offsets) on the MAPS dataset.
Furthermore, we extend the model to predict relative velocities of normalized
audio which results in more natural-sounding transcriptions.
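The onset-gated inference rule described in the abstract (a framewise prediction may only begin a new note in a frame where the onset detector also fires for that pitch) can be sketched in a few lines. The NumPy snippet below is an illustrative reconstruction based only on the abstract; the function name, array shapes, and the 0.5 thresholds are assumptions, not the authors' released code.

```python
import numpy as np

def gated_note_starts(frame_probs, onset_probs, threshold=0.5):
    """Apply onset gating to framewise pitch activations.

    frame_probs, onset_probs: float arrays of shape (num_frames, num_pitches)
    holding per-frame, per-pitch probabilities from the two detector heads.
    Returns a boolean piano roll of frames kept after gating.
    (Illustrative sketch; shapes and threshold are assumptions.)
    """
    frames_active = frame_probs >= threshold
    onsets_active = onset_probs >= threshold

    gated = np.zeros_like(frames_active)
    num_frames, num_pitches = frames_active.shape
    for pitch in range(num_pitches):
        note_on = False
        for t in range(num_frames):
            if not frames_active[t, pitch]:
                note_on = False                # frame detector says the note ended
            elif note_on or onsets_active[t, pitch]:
                note_on = True                 # continue a sounding note, or start a
                gated[t, pitch] = True         # new one only where an onset is detected
            # else: frame is active but no onset ever started the note -> suppressed
    return gated
```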
@misc{hawthorne2017onsets,
author = {Hawthorne, Curtis and Elsen, Erich and Song, Jialin and Roberts, Adam and Simon, Ian and Raffel, Colin and Engel, Jesse and Oore, Sageev and Eck, Douglas},
keywords = {representation scorebased transcription},
note = {cite arxiv:1710.11153. Comment: Examples available at https://goo.gl/magenta/onsets-frames-examples},
title = {Onsets and Frames: Dual-Objective Piano Transcription},
url = {http://arxiv.org/abs/1710.11153},
year = 2017
}