Feature selection is a key step in Quantitative
Structure Activity Relationship (QSAR) analysis. Chance
correlations and multicollinearity are two major
problems often encountered when attempting to find
generalised QSAR models for use in drug design. Optimal
QSAR models require an objective variable relevance
analysis step for producing robust classifiers with low
complexity and good predictive accuracy. Genetic
algorithms coupled with information theoretic
approaches such as mutual information have been used to
find near-optimal solutions to such multicriteria
optimisation problems. In this paper, we describe a
novel approach for analyzing QSAR data based on these
methods. Our experiments with the Thrombin dataset,
previously studied as part of the KDD (Knowledge
Discovery and Data Mining) Cup 2001 demonstrate the
feasibility of this approach. It has been found that it
is important to take into account the data
distribution, the rule "interestingness", and the
need to look at more invariant and monotonic measures
of feature selection.
http://pubs.acs.org/journals/jcisd8/index.html
American Chemical Society S0095-2338(04)09933-0
School of Biological Sciences, University of Exeter,
Exeter EX4 4QF, Great Britain and School of Engineering
and Computer Science, University of Exeter, Exeter EX4
4QF, Great Britain
PMID: 15446827
%0 Journal Article
%1 venkatraman:2004:CIM
%A Venkatraman, Vishwesh
%A Dalby, Andrew Rowland
%A Yang, Zheng Rong
%D 2004
%J Journal of Chemical Information and Modeling
%K algorithms, genetic programming
%N 5
%P 1686--1692
%R 10.1021/ci049933v
%T Evaluation of Mutual Information and Genetic
Programming for Feature Selection in QSAR
%V 44
%X Feature selection is a key step in Quantitative
Structure Activity Relationship (QSAR) analysis. Chance
correlations and multicollinearity are two major
problems often encountered when attempting to find
generalised QSAR models for use in drug design. Optimal
QSAR models require an objective variable relevance
analysis step for producing robust classifiers with low
complexity and good predictive accuracy. Genetic
algorithms coupled with information theoretic
approaches such as mutual information have been used to
find near-optimal solutions to such multicriteria
optimisation problems. In this paper, we describe a
novel approach for analyzing QSAR data based on these
methods. Our experiments with the Thrombin dataset,
previously studied as part of the KDD (Knowledge
Discovery and Data Mining) Cup 2001 demonstrate the
feasibility of this approach. It has been found that it
is important to take into account the data
distribution, the rule "interestingness", and the
need to look at more invariant and monotonic measures
of feature selection.
@comment{Journal article record. The doi field holds the bare DOI (no scheme or resolver prefix) so styles can build the link themselves. The notes field is a non-standard field kept verbatim from the export; standard styles ignore it.}
@article{venkatraman:2004:CIM,
  abstract  = {Feature selection is a key step in Quantitative
               Structure Activity Relationship (QSAR) analysis. Chance
               correlations and multicollinearity are two major
               problems often encountered when attempting to find
               generalised QSAR models for use in drug design. Optimal
               QSAR models require an objective variable relevance
               analysis step for producing robust classifiers with low
               complexity and good predictive accuracy. Genetic
               algorithms coupled with information theoretic
               approaches such as mutual information have been used to
               find near-optimal solutions to such multicriteria
               optimisation problems. In this paper, we describe a
               novel approach for analyzing QSAR data based on these
               methods. Our experiments with the Thrombin dataset,
               previously studied as part of the KDD (Knowledge
               Discovery and Data Mining) Cup 2001 demonstrate the
               feasibility of this approach. It has been found that it
               is important to take into account the data
               distribution, the rule ``interestingness'', and the
               need to look at more invariant and monotonic measures
               of feature selection.},
  added-at  = {2008-06-19T17:46:40.000+0200},
  author    = {Venkatraman, Vishwesh and Dalby, Andrew Rowland and Yang, Zheng Rong},
  biburl    = {https://www.bibsonomy.org/bibtex/25ed323ebbeff9cf59306434dde0d25b6/brazovayeye},
  doi       = {10.1021/ci049933v},
  interhash = {08424be7b38c7613e128df8537bbb182},
  intrahash = {5ed323ebbeff9cf59306434dde0d25b6},
  journal   = {Journal of Chemical Information and Modeling},
  keywords  = {algorithms, genetic programming},
  notes     = {http://pubs.acs.org/journals/jcisd8/index.html
               American Chemical Society S0095-2338(04)09933-0
               School of Biological Sciences, University of Exeter,
               Exeter EX4 4QF, Great Britain and School of Engineering
               and Computer Science, University of Exeter, Exeter EX4
               4QF, Great Britain
               PMID: 15446827},
  number    = {5},
  pages     = {1686--1692},
  timestamp = {2008-06-19T17:53:39.000+0200},
  title     = {Evaluation of Mutual Information and Genetic
               Programming for Feature Selection in {QSAR}},
  volume    = {44},
  year      = {2004}
}