We present a unifying framework for information theoretic feature selection, bringing almost two decades of research on heuristic filter criteria under a single theoretical interpretation. This is in response to the question: <i>"what are the implicit statistical assumptions of feature selection criteria based on mutual information?"</i>. To answer this, we adopt a different strategy than is usual in the feature selection literature: instead of trying to <i>define</i> a criterion, we <i>derive</i> one, directly from a clearly specified objective function: the conditional likelihood of the training labels. While many hand-designed heuristic criteria try to optimize a definition of feature 'relevancy' and 'redundancy', our approach leads to a probabilistic framework which naturally incorporates these concepts. As a result we can unify the numerous criteria published over the last two decades, and show them to be low-order approximations to the exact (but intractable) optimisation problem. The primary contribution is to show that <i>common heuristics for information based feature selection (including Markov Blanket algorithms as a special case) are approximate iterative maximisers of the conditional likelihood.</i> A large empirical study provides strong evidence to favour certain classes of criteria, in particular those that balance the relative size of the relevancy/redundancy terms. Overall we conclude that the JMI criterion (Yang and Moody, 1999; Meyer et al., 2008) provides the best tradeoff in terms of accuracy, stability, and flexibility with small data samples.
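As a concrete illustration of the JMI criterion that the abstract singles out, the sketch below scores each candidate feature X_k by the sum of joint mutual informations I(X_k, X_j; Y) with the already selected features X_j, and greedily adds the best-scoring candidate. This is a minimal sketch assuming discrete features and a simple plug-in entropy estimator; the function names (`jmi_forward_selection`, `mutual_information`) are illustrative choices, not code from the paper or its authors.

```python
# Minimal sketch of JMI-style greedy forward selection for discrete features.
# Scores a candidate X_k by sum_{X_j in selected} I(X_k, X_j ; Y).
# Plug-in estimators only; not the authors' implementation.
import numpy as np
from collections import Counter


def entropy(symbols):
    """Plug-in (maximum-likelihood) entropy estimate, in nats."""
    counts = np.array(list(Counter(symbols).values()), dtype=float)
    p = counts / counts.sum()
    return float(-np.sum(p * np.log(p)))


def mutual_information(x, y):
    """I(X;Y) = H(X) + H(Y) - H(X,Y) for discrete sequences."""
    return entropy(x) + entropy(y) - entropy(list(zip(x, y)))


def jmi_forward_selection(X, y, num_features):
    """Greedy forward selection with the JMI score.

    X: (n_samples, n_features) array of discrete (integer) features.
    y: (n_samples,) array of class labels.
    Returns the indices of the selected features, in selection order.
    """
    n_total = X.shape[1]
    # Seed with the individually most relevant feature, argmax_k I(X_k ; Y).
    relevance = [mutual_information(X[:, k], y) for k in range(n_total)]
    selected = [int(np.argmax(relevance))]
    while len(selected) < min(num_features, n_total):
        best_k, best_score = None, -np.inf
        for k in range(n_total):
            if k in selected:
                continue
            # JMI score: joint information of the (candidate, selected) pair
            # with the class Y, summed over all features selected so far.
            score = sum(
                mutual_information(list(zip(X[:, k], X[:, j])), y)
                for j in selected
            )
            if score > best_score:
                best_k, best_score = k, score
        selected.append(best_k)
    return selected
```

Per the paper's analysis, this score implicitly balances the relevancy term I(X_k;Y) against averaged redundancy and class-conditional redundancy terms, which is the property the abstract credits for JMI's favourable accuracy/stability tradeoff.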
@article{Brown2012,
journal = {Journal of Machine Learning Research},
author = {Brown, Gavin and Pocock, Adam and Zhao, Ming-Jie and Luján, Mikel},
keywords = {conditional_likelihood entropy feature_selection information_theory mutual_information},
pages = {27--66},
title = {Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection},
url = {http://jmlr.csail.mit.edu/papers/v13/brown12a.html},
volume = {13},
year = 2012
}