We study the problem of injecting knowledge into large pre-trained models such as BERT and RoBERTa. Existing methods typically update the original parameters of the pre-trained model when injecting knowledge, so when multiple kinds of knowledge are injected they may suffer from catastrophic forgetting. To address this, we propose K-Adapter, which keeps the original parameters of the pre-trained model fixed and supports continual knowledge infusion. Taking RoBERTa as the pre-trained model, K-Adapter attaches a neural adapter for each kind of infused knowledge, like a plug-in connected to RoBERTa. Because there is no information flow between different adapters, they can be trained efficiently in a distributed way. We inject two kinds of knowledge: factual knowledge obtained from automatically aligned text–triple pairs from Wikipedia and Wikidata, and linguistic knowledge obtained from dependency parsing. Results on three knowledge-driven tasks (six datasets in total), namely relation classification, entity typing, and question answering, demonstrate that each adapter improves performance and that combining both adapters brings further improvements. Probing experiments further show that K-Adapter captures richer factual and commonsense knowledge than RoBERTa.
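The abstract only describes the architecture at a high level. As a rough illustration of the idea, namely frozen pre-trained parameters, one independent adapter per kind of knowledge, and the adapter outputs combined with the RoBERTa representation for downstream tasks, the following minimal PyTorch sketch may help. The tapped layers, bottleneck size, and the final concatenation are illustrative assumptions, not the authors' released implementation.

import torch
import torch.nn as nn
from transformers import RobertaModel


class KnowledgeAdapter(nn.Module):
    """One adapter per kind of knowledge; adapters never exchange information.
    (Sketch: layer taps and bottleneck size are assumptions, not the paper's exact setup.)"""

    def __init__(self, hidden_size=768, bottleneck=128, tap_layers=(0, 6, 12)):
        super().__init__()
        self.tap_layers = tap_layers  # which frozen encoder layers this adapter reads
        self.blocks = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_size, bottleneck),
                nn.GELU(),
                nn.Linear(bottleneck, hidden_size),
            )
            for _ in tap_layers
        ])

    def forward(self, hidden_states):
        # hidden_states: tuple of per-layer outputs from the frozen encoder
        out = 0
        for block, layer in zip(self.blocks, self.tap_layers):
            # chain the adapter blocks over the tapped hidden states
            out = block(hidden_states[layer] + out)
        return out


class KAdapterSketch(nn.Module):
    """Frozen RoBERTa plus one independent adapter per knowledge kind."""

    def __init__(self, adapter_names=("factual", "linguistic")):
        super().__init__()
        self.encoder = RobertaModel.from_pretrained("roberta-base")
        for p in self.encoder.parameters():
            p.requires_grad = False  # the original pre-trained parameters stay fixed
        self.adapters = nn.ModuleDict(
            {name: KnowledgeAdapter() for name in adapter_names}
        )

    def forward(self, input_ids, attention_mask=None):
        enc = self.encoder(
            input_ids, attention_mask=attention_mask, output_hidden_states=True
        )
        # Each adapter reads the frozen hidden states independently (no
        # information flow between adapters); their outputs are concatenated
        # with the RoBERTa representation and fed to a task-specific head.
        adapter_outs = [adapter(enc.hidden_states) for adapter in self.adapters.values()]
        return torch.cat([enc.last_hidden_state, *adapter_outs], dim=-1)

Because no gradients reach the frozen encoder and the adapters never read each other's outputs, each adapter can in principle be trained separately on its own knowledge source and plugged in later without retraining the others, which is what makes the distributed training described in the abstract possible.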
Description: K-Adapter: Infusing Knowledge into Pre-Trained Models with Adapters
@misc{wang2020kadapter,
added-at = {2020-05-06T10:27:29.000+0200},
author = {Wang, Ruize and Tang, Duyu and Duan, Nan and Wei, Zhongyu and Huang, Xuanjing and Ji, Jianshu and Cao, Guihong and Jiang, Daxin and Zhou, Ming},
biburl = {https://www.bibsonomy.org/bibtex/235d80999c453f2f984bbe85ba387d2ab/hotho},
description = {K-Adapter: Infusing Knowledge into Pre-Trained Models with Adapters},
interhash = {42b3154aea4b5ed945ec9c65869c2128},
intrahash = {35d80999c453f2f984bbe85ba387d2ab},
keywords = {deep forgetting knowledge learning nlp toread},
note = {cite arxiv:2002.01808},
timestamp = {2020-05-06T10:27:29.000+0200},
title = {K-Adapter: Infusing Knowledge into Pre-Trained Models with Adapters},
url = {http://arxiv.org/abs/2002.01808},
year = 2020
}