We propose approaching prediction from a framework grounded in the theoretical correct prediction rate of a variable set as a parameter of interest. This framework allows us to define a measure of predictivity that enables assessing variable sets for, preferably high, predictivity. We first define the prediction rate for a variable set and consider, and ultimately reject, the naive estimator, a statistic based on the observed sample data, due to its inflated bias for moderate sample size and its sensitivity to noisy useless variables. We demonstrate that the I-score of the PR method of VS yields a relatively unbiased estimate of a parameter that is not sensitive to noisy variables and is a lower bound to the parameter of interest. Thus, the PR method using the I-score provides an effective approach to selecting highly predictive variables. We offer simulations and an application of the I-score on real data to demonstrate the statistic's predictive performance on sample data. We conjecture that using the partition retention and I-score can aid in finding variable sets with promising prediction rates; however, further research in the avenue of sample-based measures of predictivity is much desired.
%0 Journal Article
%1 LoChernoffEtAl16pnas
%A Lo, Adeline
%A Chernoff, Herman
%A Zheng, Tian
%A Lo, Shaw-Hwa
%D 2016
%J Proceedings of the National Academy of Sciences
%K 01624 springer paper science theory optimize data analysis zzz.big
%N 50
%P 14277--14282
%R 10.1073/pnas.1616647113
%T Framework for Making Better Predictions by Directly Estimating Variables' Predictivity
%V 113
%X We propose approaching prediction from a framework grounded in the theoretical correct prediction rate of a variable set as a parameter of interest. This framework allows us to define a measure of predictivity that enables assessing variable sets for, preferably high, predictivity. We first define the prediction rate for a variable set and consider, and ultimately reject, the naive estimator, a statistic based on the observed sample data, due to its inflated bias for moderate sample size and its sensitivity to noisy useless variables. We demonstrate that the I-score of the PR method of VS yields a relatively unbiased estimate of a parameter that is not sensitive to noisy variables and is a lower bound to the parameter of interest. Thus, the PR method using the I-score provides an effective approach to selecting highly predictive variables. We offer simulations and an application of the I-score on real data to demonstrate the statistic's predictive performance on sample data. We conjecture that using the partition retention and I-score can aid in finding variable sets with promising prediction rates; however, further research in the avenue of sample-based measures of predictivity is much desired.
@article{LoChernoffEtAl16pnas,
  author    = {Lo, Adeline and Chernoff, Herman and Zheng, Tian and Lo, Shaw-Hwa},
  title     = {Framework for Making Better Predictions by Directly Estimating Variables' Predictivity},
  journal   = {Proceedings of the National Academy of Sciences},
  year      = {2016},
  volume    = {113},
  number    = {50},
  pages     = {14277--14282},
  issn      = {0027-8424},
  doi       = {10.1073/pnas.1616647113},
  abstract  = {We propose approaching prediction from a framework grounded in the theoretical correct prediction rate of a variable set as a parameter of interest. This framework allows us to define a measure of predictivity that enables assessing variable sets for, preferably high, predictivity. We first define the prediction rate for a variable set and consider, and ultimately reject, the naive estimator, a statistic based on the observed sample data, due to its inflated bias for moderate sample size and its sensitivity to noisy useless variables. We demonstrate that the I-score of the PR method of VS yields a relatively unbiased estimate of a parameter that is not sensitive to noisy variables and is a lower bound to the parameter of interest. Thus, the PR method using the I-score provides an effective approach to selecting highly predictive variables. We offer simulations and an application of the I-score on real data to demonstrate the statistic's predictive performance on sample data. We conjecture that using the partition retention and I-score can aid in finding variable sets with promising prediction rates; however, further research in the avenue of sample-based measures of predictivity is much desired.},
  keywords  = {01624 springer paper science theory optimize data analysis zzz.big},
  file      = {SpringerLink:2016/LoChernoffEtAl16pnas.pdf:PDF},
  groups    = {public},
  username  = {flint63},
  added-at  = {2016-12-17T18:40:04.000+0100},
  timestamp = {2017-07-13T17:22:54.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/286cd43ff316ad62faad9819fd261c00a/flint63},
  interhash = {5ee6a56b46e003157299a3e3d4552ac8},
  intrahash = {86cd43ff316ad62faad9819fd261c00a},
}