Audio pattern recognition is an important research topic in the machine
learning area, and includes several tasks such as audio tagging, acoustic scene
classification, music classification, speech emotion classification and sound
event detection. Recently, neural networks have been applied to tackle audio
pattern recognition problems. However, previous systems are built on specific
datasets with limited durations. Recently, in computer vision and natural
language processing, systems pretrained on large-scale datasets have
generalized well to several tasks. However, there is limited research on
pretraining systems on large-scale datasets for audio pattern recognition. In
this paper, we propose pretrained audio neural networks (PANNs) trained on the
large-scale AudioSet dataset. These PANNs are transferred to other audio
related tasks. We investigate the performance and computational complexity of
PANNs modeled by a variety of convolutional neural networks. We propose an
architecture called Wavegram-Logmel-CNN using both log-mel spectrogram and
waveform as input feature. Our best PANN system achieves a state-of-the-art
mean average precision (mAP) of 0.439 on AudioSet tagging, outperforming the
best previous system of 0.392. We transfer PANNs to six audio pattern
recognition tasks, and demonstrate state-of-the-art performance in several of
those tasks. We have released the source code and pretrained models of PANNs:
https://github.com/qiuqiangkong/audioset_tagging_cnn.
%0 Generic
%1 kong2019panns
%A Kong, Qiuqiang
%A Cao, Yin
%A Iqbal, Turab
%A Wang, Yuxuan
%A Wang, Wenwu
%A Plumbley, Mark D.
%D 2019
%K from:lukasbarth imported ma_ss22_ts
%T PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern
Recognition
%U http://arxiv.org/abs/1912.10211
%X Audio pattern recognition is an important research topic in the machine
learning area, and includes several tasks such as audio tagging, acoustic scene
classification, music classification, speech emotion classification and sound
event detection. Recently, neural networks have been applied to tackle audio
pattern recognition problems. However, previous systems are built on specific
datasets with limited durations. Recently, in computer vision and natural
language processing, systems pretrained on large-scale datasets have
generalized well to several tasks. However, there is limited research on
pretraining systems on large-scale datasets for audio pattern recognition. In
this paper, we propose pretrained audio neural networks (PANNs) trained on the
large-scale AudioSet dataset. These PANNs are transferred to other audio
related tasks. We investigate the performance and computational complexity of
PANNs modeled by a variety of convolutional neural networks. We propose an
architecture called Wavegram-Logmel-CNN using both log-mel spectrogram and
waveform as input feature. Our best PANN system achieves a state-of-the-art
mean average precision (mAP) of 0.439 on AudioSet tagging, outperforming the
best previous system of 0.392. We transfer PANNs to six audio pattern
recognition tasks, and demonstrate state-of-the-art performance in several of
those tasks. We have released the source code and pretrained models of PANNs:
https://github.com/qiuqiangkong/audioset_tagging_cnn.
@misc{kong2019panns,
  abstract      = {Audio pattern recognition is an important research topic in the machine
learning area, and includes several tasks such as audio tagging, acoustic scene
classification, music classification, speech emotion classification and sound
event detection. Recently, neural networks have been applied to tackle audio
pattern recognition problems. However, previous systems are built on specific
datasets with limited durations. Recently, in computer vision and natural
language processing, systems pretrained on large-scale datasets have
generalized well to several tasks. However, there is limited research on
pretraining systems on large-scale datasets for audio pattern recognition. In
this paper, we propose pretrained audio neural networks (PANNs) trained on the
large-scale AudioSet dataset. These PANNs are transferred to other audio
related tasks. We investigate the performance and computational complexity of
PANNs modeled by a variety of convolutional neural networks. We propose an
architecture called Wavegram-Logmel-CNN using both log-mel spectrogram and
waveform as input feature. Our best PANN system achieves a state-of-the-art
mean average precision (mAP) of 0.439 on AudioSet tagging, outperforming the
best previous system of 0.392. We transfer PANNs to six audio pattern
recognition tasks, and demonstrate state-of-the-art performance in several of
those tasks. We have released the source code and pretrained models of PANNs:
https://github.com/qiuqiangkong/audioset_tagging_cnn.},
  added-at      = {2022-07-05T18:41:46.000+0200},
  archiveprefix = {arXiv},
  author        = {Kong, Qiuqiang and Cao, Yin and Iqbal, Turab and Wang, Yuxuan and Wang, Wenwu and Plumbley, Mark D.},
  biburl        = {https://www.bibsonomy.org/bibtex/2d464be705d72bb4152e15c80b4fcc227/lukasbarth},
  description   = {1912.10211v5.pdf},
  eprint        = {1912.10211},
  interhash     = {8de1c7f0ee2386a126f2e599a22b72cb},
  intrahash     = {d464be705d72bb4152e15c80b4fcc227},
  keywords      = {from:lukasbarth imported ma_ss22_ts},
  note          = {arXiv:1912.10211, 14 pages},
  timestamp     = {2022-07-05T18:41:46.000+0200},
  title         = {{PANNs}: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition},
  url           = {http://arxiv.org/abs/1912.10211},
  year          = {2019},
}