We propose approaching prediction from a framework grounded in the theoretical correct prediction rate of a variable set as a parameter of interest. This framework allows us to define a measure of predictivity that enables assessing variable sets for, preferably high, predictivity. We first define the prediction rate for a variable set and consider, and ultimately reject, the naive estimator, a statistic based on the observed sample data, due to its inflated bias for moderate sample size and its sensitivity to noisy useless variables. We demonstrate that the I-score of the PR method of VS yields a relatively unbiased estimate of a parameter that is not sensitive to noisy variables and is a lower bound to the parameter of interest. Thus, the PR method using the I-score provides an effective approach to selecting highly predictive variables. We offer simulations and an application of the I-score on real data to demonstrate the statistic’s predictive performance on sample data. We conjecture that using the partition retention and I-score can aid in finding variable sets with promising prediction rates; however, further research in the avenue of sample-based measures of predictivity is much desired.
%0 Journal Article
%1 lo_framework_2016
%A Lo, Adeline
%A Chernoff, Herman
%A Zheng, Tian
%A Lo, Shaw-Hwa
%D 2016
%J Proceedings of the National Academy of Sciences
%K SNP, data, genomics, high-dimensional prediction, predictivity, selection variable
%N 50
%P 14277--14282
%R 10.1073/pnas.1616647113
%T Framework for making better predictions by directly estimating variables’ predictivity
%U http://www.pnas.org/content/113/50/14277
%V 113
%X We propose approaching prediction from a framework grounded in the theoretical correct prediction rate of a variable set as a parameter of interest. This framework allows us to define a measure of predictivity that enables assessing variable sets for, preferably high, predictivity. We first define the prediction rate for a variable set and consider, and ultimately reject, the naive estimator, a statistic based on the observed sample data, due to its inflated bias for moderate sample size and its sensitivity to noisy useless variables. We demonstrate that the I-score of the PR method of VS yields a relatively unbiased estimate of a parameter that is not sensitive to noisy variables and is a lower bound to the parameter of interest. Thus, the PR method using the I-score provides an effective approach to selecting highly predictive variables. We offer simulations and an application of the I-score on real data to demonstrate the statistic’s predictive performance on sample data. We conjecture that using the partition retention and I-score can aid in finding variable sets with promising prediction rates; however, further research in the avenue of sample-based measures of predictivity is much desired.
@article{lo_framework_2016,
  abstract   = {We propose approaching prediction from a framework grounded in the theoretical correct prediction rate of a variable set as a parameter of interest. This framework allows us to define a measure of predictivity that enables assessing variable sets for, preferably high, predictivity. We first define the prediction rate for a variable set and consider, and ultimately reject, the naive estimator, a statistic based on the observed sample data, due to its inflated bias for moderate sample size and its sensitivity to noisy useless variables. We demonstrate that the {$I$}-score of the PR method of VS yields a relatively unbiased estimate of a parameter that is not sensitive to noisy variables and is a lower bound to the parameter of interest. Thus, the PR method using the {$I$}-score provides an effective approach to selecting highly predictive variables. We offer simulations and an application of the {$I$}-score on real data to demonstrate the statistic's predictive performance on sample data. We conjecture that using the partition retention and {$I$}-score can aid in finding variable sets with promising prediction rates; however, further research in the avenue of sample-based measures of predictivity is much desired.},
  added-at   = {2017-01-09T13:57:26.000+0100},
  author     = {Lo, Adeline and Chernoff, Herman and Zheng, Tian and Lo, Shaw-Hwa},
  biburl     = {https://www.bibsonomy.org/bibtex/2f4452b1827950acd7c795812aa14afb2/yourwelcome},
  doi        = {10.1073/pnas.1616647113},
  interhash  = {5ee6a56b46e003157299a3e3d4552ac8},
  intrahash  = {f4452b1827950acd7c795812aa14afb2},
  issn       = {0027-8424, 1091-6490},
  journal    = {Proceedings of the National Academy of Sciences},
  keywords   = {SNP, data, genomics, high-dimensional prediction, predictivity, selection variable},
  language   = {en},
  month      = dec,
  number     = 50,
  pages      = {14277--14282},
  pmid       = {27911830},
  timestamp  = {2017-01-09T14:01:11.000+0100},
  title      = {Framework for making better predictions by directly estimating variables' predictivity},
  url        = {http://www.pnas.org/content/113/50/14277},
  urldate    = {2017-01-02},
  volume     = 113,
  year       = 2016,
}