Large-batch training is key to speeding up deep neural network training in
large distributed systems. However, large-batch training is difficult because
it produces a generalization gap: straightforward optimization often leads to
accuracy loss on the test set. BERT (Devlin et al., 2018) is a state-of-the-art
deep learning model that builds on top of deep bidirectional transformers for
language understanding. Previous large-batch training techniques do not perform
well for BERT when we scale the batch size (e.g. beyond 8192). BERT
pre-training also takes a long time to finish (around three days on 16 TPUv3
chips). To solve this problem, we propose the LAMB optimizer, which helps us to
scale the batch size to 65536 without losing accuracy. LAMB is a general
optimizer that works for both small and large batch sizes and does not need
hyper-parameter tuning besides the learning rate. The baseline BERT-Large model
needs 1 million iterations to finish pre-training, while LAMB with batch sizes of
65536/32768 (for the two pre-training stages) needs only 8599 iterations in total. We push the batch size to the memory
limit of a TPUv3 pod and can finish BERT training in 76 minutes.
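
Since the abstract only names the optimizer, the following is a minimal sketch of the layer-wise adaptive update that LAMB applies on top of Adam-style moment estimates. It is an illustration under our own assumptions, not the authors' reference implementation; the function name, default hyper-parameters, and toy usage below are ours.

    import numpy as np

    def lamb_step(param, grad, m, v, step, lr=1e-3, beta1=0.9, beta2=0.999,
                  eps=1e-6, weight_decay=0.01):
        """One LAMB update for a single parameter tensor (illustrative sketch)."""
        # Adam-style first and second moment estimates with bias correction.
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad * grad
        m_hat = m / (1 - beta1 ** step)
        v_hat = v / (1 - beta2 ** step)
        # Adam-like update direction plus decoupled weight decay.
        update = m_hat / (np.sqrt(v_hat) + eps) + weight_decay * param
        # Layer-wise trust ratio: scale the step by ||w|| / ||update||, so each
        # layer takes a step proportional to its own weight norm. This is the
        # normalization that keeps training stable at very large batch sizes.
        w_norm = np.linalg.norm(param)
        u_norm = np.linalg.norm(update)
        trust_ratio = w_norm / u_norm if w_norm > 0 and u_norm > 0 else 1.0
        param = param - lr * trust_ratio * update
        return param, m, v

    # Toy usage: one step on a single weight matrix.
    w = np.random.randn(4, 4)
    g = np.random.randn(4, 4)
    m, v = np.zeros_like(w), np.zeros_like(w)
    w, m, v = lamb_step(w, g, m, v, step=1)

The rule is applied separately to each layer (each weight tensor), which is what distinguishes it from simply rescaling a single global Adam learning rate.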
Description
[1904.00962] Reducing BERT Pre-Training Time from 3 Days to 76 Minutes
@misc{you2019reducing,
author = {You, Yang and Li, Jing and Hseu, Jonathan and Song, Xiaodan and Demmel, James and Hsieh, Cho-Jui},
keywords = {BERT nlp},
note = {arXiv:1904.00962},
title = {Reducing BERT Pre-Training Time from 3 Days to 76 Minutes},
url = {http://arxiv.org/abs/1904.00962},
year = 2019
}