The performance of deep network learning strongly depends on the choice of the non-linear activation function associated with each neuron. However, deciding on the best activation is non-trivial and the choice depends on the architecture, hyper-parameters, and even on the dataset. Typically these activations are fixed by hand before training. Here, we demonstrate how to eliminate the reliance on first picking fixed activation functions by using flexible parametric rational functions instead. The resulting Padé Activation Units (PAUs) can both approximate common activation functions and also learn new ones while providing compact representations. Our empirical evidence shows that end-to-end learning deep networks with PAUs can increase the predictive performance and reduce the training time of common deep architectures. Moreover, PAUs pave the way to approximations with provable robustness. The source code can be found at https://github.com/ml-research/pau
The paper presents rational functions as learnable activation functions.
Instead of standard Padé approximants, safe Padé approximants are used; these guarantee that the learned function has no poles, because the denominator is constructed so that it can never reach zero.
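As a reminder (hedged, from my reading of the paper; the exact degree conventions may differ), the safe variant bounds the denominator away from zero:

F(x) = \frac{\sum_{j=0}^{m} a_j x^j}{1 + \left| \sum_{k=1}^{n} b_k x^k \right|}

so the denominator is always at least 1 for real x and the activation cannot blow up.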
The parameters of the safe Padé approximant (SPA) are trained jointly with the network, but the orders of the numerator and denominator have to be fixed beforehand, and the parameters are initialized to match ReLU variants.
Personal note: initializing the SPAs to match existing activation functions makes me wonder whether this biases the SPA towards the chosen activation function. Unfortunately, the authors do not provide experiments with randomly initialized SPA parameters.
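To make the mechanics concrete, here is a minimal PyTorch sketch (my own illustrative code, not the authors' implementation from the linked repository; the module name, the default orders m=5 and n=4, and the fitting-based initialisation routine are assumptions):

import torch
import torch.nn as nn
import torch.nn.functional as F


class SafeRationalActivation(nn.Module):
    """Sketch of a safe rational activation F(x) = P(x) / Q(x)
    with Q(x) = 1 + |b_1 x + ... + b_n x^n|, so Q(x) >= 1 and no poles occur."""

    def __init__(self, m: int = 5, n: int = 4):
        super().__init__()
        # Learnable coefficients, updated jointly with the rest of the network;
        # the orders m and n are fixed hyper-parameters.
        self.a = nn.Parameter(torch.randn(m + 1) * 0.1)  # numerator a_0 .. a_m
        self.b = nn.Parameter(torch.randn(n) * 0.1)      # denominator b_1 .. b_n

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        xp = torch.stack([x ** k for k in range(self.a.numel())], dim=-1)
        numerator = (xp * self.a).sum(-1)
        xq = torch.stack([x ** (k + 1) for k in range(self.b.numel())], dim=-1)
        denominator = 1.0 + (xq * self.b).sum(-1).abs()  # always >= 1
        return numerator / denominator


def init_to_match(act: SafeRationalActivation, target=F.leaky_relu, steps: int = 2000):
    # Illustrative stand-in for the paper's initialisation: fit the coefficients
    # to a reference activation (here Leaky ReLU) on a grid before training.
    x = torch.linspace(-3.0, 3.0, 1000)
    opt = torch.optim.Adam(act.parameters(), lr=1e-2)
    for _ in range(steps):
        opt.zero_grad()
        loss = F.mse_loss(act(x), target(x))
        loss.backward()
        opt.step()
    return act

After init_to_match, the module is dropped into a network in place of a fixed nonlinearity, and its coefficients keep being updated by the main optimiser during end-to-end training.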
@article{Molina2019,
abstract = {The performance of deep network learning strongly depends on the choice of the non-linear activation function associated with each neuron. However, deciding on the best activation is non-trivial and the choice depends on the architecture, hyper-parameters, and even on the dataset. Typically these activations are fixed by hand before training. Here, we demonstrate how to eliminate the reliance on first picking fixed activation functions by using flexible parametric rational functions instead. The resulting Pad\'e Activation Units (PAUs) can both approximate common activation functions and also learn new ones while providing compact representations. Our empirical evidence shows that end-to-end learning deep networks with PAUs can increase the predictive performance and reduce the training time of common deep architectures. Moreover, PAUs pave the way to approximations with provable robustness. The source code can be found at https://github.com/ml-research/pau},
added-at = {2020-10-15T14:36:56.000+0200},
author = {Molina, Alejandro and Schramowski, Patrick and Kersting, Kristian},
biburl = {https://www.bibsonomy.org/bibtex/2a50d77106bfbc3500d31e8272674329e/annakrause},
comment = {The paper presents rational functions as learnable activation functions.
Instead of standard Padé approximants, safe Padé approximants are used; these guarantee that the learned function has no poles, because the denominator is constructed so that it can never reach zero.
The parameters of the safe Padé approximant (SPA) are trained jointly with the network, but the orders of the numerator and denominator have to be fixed beforehand, and the parameters are initialized to match ReLU variants.
Personal note: initializing the SPAs to match existing activation functions makes me wonder whether this biases the SPA towards the chosen activation function. Unfortunately, the authors do not provide experiments with randomly initialized SPA parameters.},
eprint = {http://arxiv.org/abs/1907.06732v1},
eprintclass = {cs.LG},
eprinttype = {arXiv},
file = {:http\://arxiv.org/pdf/1907.06732v1:PDF;:Molina_ea_PadeActivationUnits2019_annotated.pdf:PDF},
interhash = {5433750ab59a37a3b2ea7851accedff9},
intrahash = {a50d77106bfbc3500d31e8272674329e},
keywords = {Activation Units, cs.LG, cs.NE},
timestamp = {2020-10-15T14:44:42.000+0200},
title = {{Padé Activation Units: End-to-end Learning of Flexible Activation Functions in Deep Networks}},
year = 2019
}