The interpretation of deep learning models is a challenge due to their size, complexity, and often opaque internal state. In addition, many systems, such as image classifiers, operate on low-level features rather than high-level concepts. To address these challenges, we introduce Concept Activation Vectors (CAVs), which provide an interpretation of a neural net’s internal state in terms of human-friendly concepts. The key idea is to view the high-dimensional internal state of a neural net as an aid, not an obstacle. We show how to use CAVs as part of a technique, Testing with CAVs (TCAV), that uses directional derivatives to quantify the degree to which a user-defined concept is important to a classification result: for example, how sensitive a prediction of “zebra” is to the presence of stripes. Using the domain of image classification as a testing ground, we describe how CAVs may be used to explore hypotheses and generate insights for a standard image classification network as well as a medical application.
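The abstract only names the mechanics, so here is a minimal NumPy/scikit-learn sketch of the idea, not the authors' implementation: a CAV is the normal to a linear boundary separating concept activations from random activations at a chosen layer, and the TCAV score is the fraction of class inputs whose logit increases along that direction. The synthetic activations, the toy logit head w_k, and the helper directional_derivative are illustrative assumptions; in practice the activations and gradients come from the trained network.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
d = 64  # dimensionality of the chosen layer's activation space

# Toy stand-ins for layer activations of concept examples ("stripes")
# and random counterexamples; in practice these come from the network.
concept_acts = rng.normal(loc=0.5, size=(100, d))
random_acts = rng.normal(loc=0.0, size=(100, d))

# 1) Fit a linear classifier separating concept from random activations;
#    the CAV is the (unit-normalized) normal to its decision boundary.
X = np.vstack([concept_acts, random_acts])
y = np.array([1] * len(concept_acts) + [0] * len(random_acts))
clf = LogisticRegression(max_iter=1000).fit(X, y)
cav = clf.coef_[0] / np.linalg.norm(clf.coef_[0])

# 2) Directional derivative of the class-k logit h_k with respect to the
#    layer activations, taken along the CAV direction. Here h_k is a toy
#    linear head; with a real network you would use autograd gradients.
w_k = rng.normal(size=d)

def directional_derivative(acts, v, eps=1e-3):
    # Finite-difference estimate of d/d(eps) h_k(a + eps * v) at eps = 0.
    h = lambda a: a @ w_k
    return (h(acts + eps * v) - h(acts)) / eps

# 3) TCAV score: fraction of class-k inputs whose prediction increases
#    when the activation moves in the concept direction.
zebra_acts = rng.normal(size=(200, d))  # toy activations of "zebra" inputs
tcav_score = float(np.mean(directional_derivative(zebra_acts, cav) > 0))
print(f"TCAV score: {tcav_score:.2f}")

On this synthetic data the score is meaningless; with real activations, a score well above 0.5 that is stable across repeated random counterexample sets indicates the concept pushes the class logit up.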
%0 Conference Paper
%1 kim2018interpretability
%A Kim, Been
%A Wattenberg, Martin
%A Gilmer, Justin
%A Cai, Carrie
%A Wexler, James
%A Viegas, Fernanda
%A Sayres, Rory
%B Proceedings of the 35th International Conference on Machine Learning
%D 2018
%E Dy, Jennifer
%E Krause, Andreas
%I PMLR
%K deeplearning explainability explainable interpretability interpretable learning machine ml model
%P 2668-2677
%T Interpretability Beyond Feature Attribution: Quantitative Testing with Concept Activation Vectors (TCAV)
%U http://proceedings.mlr.press/v80/kim18d.html
%V 80
@inproceedings{kim2018interpretability,
author = {Kim, Been and Wattenberg, Martin and Gilmer, Justin and Cai, Carrie and Wexler, James and Viegas, Fernanda and Sayres, Rory},
booktitle = {Proceedings of the 35th International Conference on Machine Learning},
editor = {Dy, Jennifer and Krause, Andreas},
issn = {2640-3498},
keywords = {deeplearning explainability explainable interpretability interpretable learning machine ml model},
pages = {2668--2677},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
title = {Interpretability Beyond Feature Attribution: Quantitative Testing with Concept Activation Vectors (TCAV)},
url = {http://proceedings.mlr.press/v80/kim18d.html},
volume = {80},
year = {2018}
}