We define the relevant information in a signal x ∈ X as being the information that this signal provides about another signal y ∈ Y. Examples include the information that face images provide about the names of the people portrayed, or the information that speech sounds provide about the words spoken. Understanding the signal x requires more than just predicting y, it also requires specifying which features of X play a role in the prediction. We formalize the problem as that of finding a short code for X that preserves the maximum information about Y. That is, we squeeze the information that X provides about Y through a 'bottleneck' formed by a limited set of codewords X̃. This constrained optimization problem can be seen as a generalization of rate distortion theory in which the distortion measure d(x, x̃) emerges from the joint statistics of X and Y. The approach yields an exact set of self-consistent equations for the coding rules X → X̃ and X̃ → Y. Solutions to these equations can be found by a convergent re-estimation method that generalizes the Blahut-Arimoto algorithm. Our variational principle provides a surprisingly rich framework for discussing a variety of problems in signal processing and learning, as will be described in detail elsewhere.
%0 Conference Paper
%1 Tishby99theinformation
%A Tishby, Naftali
%A Pereira, Fernando C.
%A Bialek, William
%D 1999
%K 2009 bottleneck clustering co-clustering information seminar
%P 368--377
%T The Information Bottleneck Method
%U http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.6.9199
%X We define the relevant information in a signal x ∈ X as being the information that this signal provides about another signal y ∈ Y. Examples include the information that face images provide about the names of the people portrayed, or the information that speech sounds provide about the words spoken. Understanding the signal x requires more than just predicting y, it also requires specifying which features of X play a role in the prediction. We formalize the problem as that of finding a short code for X that preserves the maximum information about Y. That is, we squeeze the information that X provides about Y through a 'bottleneck' formed by a limited set of codewords X̃. This constrained optimization problem can be seen as a generalization of rate distortion theory in which the distortion measure d(x, x̃) emerges from the joint statistics of X and Y. The approach yields an exact set of self-consistent equations for the coding rules X → X̃ and X̃ → Y. Solutions to these equations can be found by a convergent re-estimation method that generalizes the Blahut-Arimoto algorithm. Our variational principle provides a surprisingly rich framework for discussing a variety of problems in signal processing and learning, as will be described in detail elsewhere.
@inproceedings{Tishby99theinformation,
  author      = {Tishby, Naftali and Pereira, Fernando C. and Bialek, William},
  title       = {The Information Bottleneck Method},
  booktitle   = {Proceedings of the 37th Annual {Allerton} Conference on Communication, Control and Computing},
  pages       = {368--377},
  year        = 1999,
  url         = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.6.9199},
  abstract    = {We define the relevant information in a signal $x \in X$ as being the information that this signal provides about another signal $y \in Y$. Examples include the information that face images provide about the names of the people portrayed, or the information that speech sounds provide about the words spoken. Understanding the signal $x$ requires more than just predicting $y$, it also requires specifying which features of $X$ play a role in the prediction. We formalize the problem as that of finding a short code for $X$ that preserves the maximum information about $Y$. That is, we squeeze the information that $X$ provides about $Y$ through a `bottleneck' formed by a limited set of codewords $\tilde{X}$. This constrained optimization problem can be seen as a generalization of rate distortion theory in which the distortion measure $d(x, \tilde{x})$ emerges from the joint statistics of $X$ and $Y$. The approach yields an exact set of self-consistent equations for the coding rules $X \to \tilde{X}$ and $\tilde{X} \to Y$. Solutions to these equations can be found by a convergent re-estimation method that generalizes the Blahut-Arimoto algorithm. Our variational principle provides a surprisingly rich framework for discussing a variety of problems in signal processing and learning, as will be described in detail elsewhere.},
  keywords    = {2009 bottleneck clustering co-clustering information seminar},
  added-at    = {2009-12-14T01:14:33.000+0100},
  timestamp   = {2009-12-14T01:14:33.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/29f59d7281b6f02200fc0710624ee3b48/r.b.},
  description = {The Information Bottleneck Method},
  interhash   = {15bd5efbf394791da00b09839b9a5757},
  intrahash   = {9f59d7281b6f02200fc0710624ee3b48}
}