Asynchronous distributed machine learning solutions have proven very
effective so far, but they typically assume perfectly functioning workers. In
practice, however, some workers can exhibit Byzantine behavior, caused by
hardware failures, software bugs, corrupt data, or even malicious attacks. We
introduce Kardam, the first distributed asynchronous stochastic gradient
descent (SGD) algorithm that copes with Byzantine workers. Kardam consists of
two complementary components: a filtering component and a dampening component. The former
is scalar-based and ensures resilience against a $\frac{1}{3}$ fraction of Byzantine workers.
Essentially, this filter leverages the Lipschitzness of cost functions and acts
as a self-stabilizer against Byzantine workers that would attempt to corrupt
the progress of SGD. The dampening component bounds the convergence rate by
adjusting to stale information through a generic gradient weighting scheme. We
prove that Kardam guarantees almost sure convergence in the presence of
asynchrony and Byzantine behavior, and we derive its convergence rate. We
evaluate Kardam on the CIFAR-100 and EMNIST datasets and measure its overhead
with respect to non-Byzantine-resilient solutions. We empirically show that
Kardam does not introduce additional noise to the learning procedure but does
induce a slowdown (the cost of Byzantine resilience) that we both theoretically
and empirically show to be less than $f/n$, where $f$ is the number of
Byzantine failures tolerated and $n$ the total number of workers.
Interestingly, we also empirically observe that the dampening component is
valuable in its own right: it enables building an SGD algorithm that
outperforms alternative staleness-aware asynchronous competitors in
environments with honest workers.
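
The abstract describes two mechanisms: a filter that judges each incoming gradient by an empirical Lipschitz coefficient, and a dampening weight that shrinks stale gradients before they are applied. The Python sketch below is not the authors' implementation; it only illustrates these two ideas under assumptions of our own. The acceptance rule (a quantile over recently accepted coefficients), the warm-up length, and the 1/(staleness + 1) weight are placeholders, and the names empirical_lipschitz, LipschitzFilter, and dampen are hypothetical; Kardam derives its actual acceptance threshold from the number of workers n and the number of tolerated Byzantine workers f.

import numpy as np

def empirical_lipschitz(grad, prev_grad, params, prev_params, eps=1e-12):
    # Empirical Lipschitz coefficient: how much the gradient changed
    # relative to how far the model parameters moved.
    return np.linalg.norm(grad - prev_grad) / (np.linalg.norm(params - prev_params) + eps)

class LipschitzFilter:
    # Accept a worker's gradient only if its empirical Lipschitz coefficient
    # is not larger than a quantile of recently accepted coefficients.
    # The quantile and warm-up length are placeholders; Kardam's threshold
    # is derived from n and f rather than from a fixed quantile.
    def __init__(self, quantile=0.9, history=100, warmup=10):
        self.quantile = quantile
        self.history = history
        self.warmup = warmup
        self.coeffs = []

    def accept(self, coeff):
        if len(self.coeffs) < self.warmup:      # accept while history is too short
            self.coeffs.append(coeff)
            return True
        ok = coeff <= np.quantile(self.coeffs, self.quantile)
        if ok:
            self.coeffs.append(coeff)
            self.coeffs = self.coeffs[-self.history:]
        return ok

def dampen(grad, staleness):
    # Staleness-aware dampening: scale a gradient computed on an old model
    # by a decreasing weight; 1/(staleness + 1) is just one admissible choice.
    return grad / (staleness + 1.0)

# Toy usage on f(x) = 0.5 * ||x||^2, whose gradient is x.
rng = np.random.default_rng(0)
x, lr = rng.normal(size=5), 0.1
filt, prev_x, prev_g = LipschitzFilter(), None, None
for step in range(50):
    g = x + 0.01 * rng.normal(size=5)           # honest stochastic gradient
    if prev_x is not None and not filt.accept(
            empirical_lipschitz(g, prev_g, x, prev_x)):
        continue                                 # drop a suspected Byzantine gradient
    prev_x, prev_g = x.copy(), g.copy()
    x = x - lr * dampen(g, staleness=step % 3)   # pretend some updates arrive stale
print("final ||x||:", np.linalg.norm(x))

As a point of reference for the claimed overhead bound, with n = 10 workers and f = 3 tolerated Byzantine workers the abstract's f/n bound keeps the slowdown below 30%.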
@misc{damaskinos2018asynchronous,
author = {Damaskinos, Georgios and Mhamdi, El Mahdi El and Guerraoui, Rachid and Patra, Rhicheek and Taziki, Mahsa},
keywords = {distributed optimization},
note = {cite arxiv:1802.07928},
title = {Asynchronous Byzantine Machine Learning},
url = {http://arxiv.org/abs/1802.07928},
year = 2018
}