When training and evaluating machine learning models on a large number of
tasks, it is important to not only look at average task accuracy -- which may
be biased by easy or redundant tasks -- but also worst-case accuracy (i.e. the
performance on the task with the lowest accuracy). In this work, we show how to
use techniques from the distributionally robust optimization (DRO) literature
to improve worst-case performance in multitask learning. We highlight several
failure cases of DRO when applied off-the-shelf and present an improved method,
Lookahead-DRO (L-DRO), which mitigates these issues. The core idea of L-DRO is
to anticipate the interaction between tasks during training in order to choose
a dynamic re-weighting of the various task losses, which will (i) lead to
minimal worst-case loss and (ii) train on as many tasks as possible. After
demonstrating the efficacy of L-DRO on a small controlled synthetic setting, we
evaluate it on two realistic benchmarks: a multitask version of the CIFAR-100
image classification dataset and a large-scale multilingual language modeling
experiment. Our empirical results show that L-DRO achieves a better trade-off
between average and worst-case accuracy with little computational overhead
compared to several strong baselines.
%0 Generic
%1 michel2021balancing
%A Michel, Paul
%A Ruder, Sebastian
%A Yogatama, Dani
%D 2021
%K multitask
%T Balancing Average and Worst-case Accuracy in Multitask Learning
%U http://arxiv.org/abs/2110.05838
%X When training and evaluating machine learning models on a large number of
tasks, it is important to not only look at average task accuracy -- which may
be biased by easy or redundant tasks -- but also worst-case accuracy (i.e. the
performance on the task with the lowest accuracy). In this work, we show how to
use techniques from the distributionally robust optimization (DRO) literature
to improve worst-case performance in multitask learning. We highlight several
failure cases of DRO when applied off-the-shelf and present an improved method,
Lookahead-DRO (L-DRO), which mitigates these issues. The core idea of L-DRO is
to anticipate the interaction between tasks during training in order to choose
a dynamic re-weighting of the various task losses, which will (i) lead to
minimal worst-case loss and (ii) train on as many tasks as possible. After
demonstrating the efficacy of L-DRO on a small controlled synthetic setting, we
evaluate it on two realistic benchmarks: a multitask version of the CIFAR-100
image classification dataset and a large-scale multilingual language modeling
experiment. Our empirical results show that L-DRO achieves a better trade-off
between average and worst-case accuracy with little computational overhead
compared to several strong baselines.
@misc{michel2021balancing,
abstract = {When training and evaluating machine learning models on a large number of
tasks, it is important to not only look at average task accuracy -- which may
be biased by easy or redundant tasks -- but also worst-case accuracy (i.e. the
performance on the task with the lowest accuracy). In this work, we show how to
use techniques from the distributionally robust optimization (DRO) literature
to improve worst-case performance in multitask learning. We highlight several
failure cases of DRO when applied off-the-shelf and present an improved method,
Lookahead-DRO (L-DRO), which mitigates these issues. The core idea of L-DRO is
to anticipate the interaction between tasks during training in order to choose
a dynamic re-weighting of the various task losses, which will (i) lead to
minimal worst-case loss and (ii) train on as many tasks as possible. After
demonstrating the efficacy of L-DRO on a small controlled synthetic setting, we
evaluate it on two realistic benchmarks: a multitask version of the CIFAR-100
image classification dataset and a large-scale multilingual language modeling
experiment. Our empirical results show that L-DRO achieves a better trade-off
between average and worst-case accuracy with little computational overhead
compared to several strong baselines.},
added-at = {2021-10-15T09:48:00.000+0200},
author = {Michel, Paul and Ruder, Sebastian and Yogatama, Dani},
biburl = {https://www.bibsonomy.org/bibtex/2e0e75595c64685caf5860ae550a553e9/topel},
interhash = {a0425de74cd6d26b92143ffeb3cf74f1},
intrahash = {e0e75595c64685caf5860ae550a553e9},
keywords = {multitask},
note = {cite arxiv:2110.05838Comment: Under review},
timestamp = {2021-10-15T09:48:00.000+0200},
title = {Balancing Average and Worst-case Accuracy in Multitask Learning},
url = {http://arxiv.org/abs/2110.05838},
year = 2021
}