An important issue in ecohydrological research is distribution modelling,
aiming at the prediction of species or vegetation type occurrence
on the basis of empirical relations with hydrological or hydrogeochemical
habitat conditions. In this study, two statistical techniques are
evaluated: (i) the widely used multiple logistic regression technique
in the generalized linear modelling framework, and (ii) a recently
developed machine learning technique called "random forests". The
latter is an ensemble learning technique that generates many classification
trees and aggregates the individual results. The two different techniques
are used to develop distribution models to predict the vegetation
type occurrence of 11 groundwater-dependent vegetation types in Belgian
lowland valley ecosystems based on spatially distributed measurements
of environmental conditions. The spatially distributed data set under
investigation consists of 1705 grid cells covering an area of 47.32
ha. After model construction and calibration, both models are applied
to independent test data sets using two-fold cross-validation and
resulting probabilities of occurrence are used to predict vegetation
type distributions within the study area. Predicted vegetation types
are compared with observations, and the McNemar test indicates an
overall better performance of the random forest model at the 0.001
significance level. Comparison of the modelling results for each
individual vegetation type separately by means of the F-measure,
which combines precision and recall, also reveals better predictions
by the random forest model. Inspection of the probabilities of occurrence
of the different vegetation types for each grid cell demonstrates
that correct predictions in central areas of homogeneous vegetation
sites are based on high probabilities, whereas the confidence decreases
towards the margins of these areas. Threshold-independent evaluation
of the model accuracy by means of the area under the receiver operating
characteristic (ROC) curves confirms good performances of both models,
but with higher values for the random forest model. Therefore, the
incorporation of the random forest technique in distribution models
has the ability to lead to better model performances.
%0 Journal Article
%1 Peters2007
%A Peters, Jan
%A De Baets, Bernard
%A Verhoest, Niko E.C.
%A Samson, Roeland
%A Degroeve, Sven
%A De Becker, Piet
%A Huybrechts, Willy
%D 2007
%J Ecological Modelling
%K imported
%P 304-318
%T Random forests as a tool for ecohydrological distribution modelling
%V 207
%X An important issue in ecohydrological research is distribution modelling,
aiming at the prediction of species or vegetation type occurrence
on the basis of empirical relations with hydrological or hydrogeochemical
habitat conditions. In this study, two statistical techniques are
evaluated: (i) the widely used multiple logistic regression technique
in the generalized linear modelling framework, and (ii) a recently
developed machine learning technique called "random forests". The
latter is an ensemble learning technique that generates many classification
trees and aggregates the individual results. The two different techniques
are used to develop distribution models to predict the vegetation
type occurrence of 11 groundwater-dependent vegetation types in Belgian
lowland valley ecosystems based on spatially distributed measurements
of environmental conditions. The spatially distributed data set under
investigation consists of 1705 grid cells covering an area of 47.32
ha. After model construction and calibration, both models are applied
to independent test data sets using two-fold cross-validation and
resulting probabilities of occurrence are used to predict vegetation
type distributions within the study area. Predicted vegetation types
are compared with observations, and the McNemar test indicates an
overall better performance of the random forest model at the 0.001
significance level. Comparison of the modelling results for each
individual vegetation type separately by means of the F-measure,
which combines precision and recall, also reveals better predictions
by the random forest model. Inspection of the probabilities of occurrence
of the different vegetation types for each grid cell demonstrates
that correct predictions in central areas of homogeneous vegetation
sites are based on high probabilities, whereas the confidence decreases
towards the margins of these areas. Threshold-independent evaluation
of the model accuracy by means of the area under the receiver operating
characteristic (ROC) curves confirms good performances of both models,
but with higher values for the random forest model. Therefore, the
incorporation of the random forest technique in distribution models
has the ability to lead to better model performances.
@article{Peters2007,
  abstract  = {An important issue in ecohydrological research is distribution modelling,
aiming at the prediction of species or vegetation type occurrence
on the basis of empirical relations with hydrological or hydrogeochemical
habitat conditions. In this study, two statistical techniques are
evaluated: (i) the widely used multiple logistic regression technique
in the generalized linear modelling framework, and (ii) a recently
developed machine learning technique called ``random forests''. The
latter is an ensemble learning technique that generates many classification
trees and aggregates the individual results. The two different techniques
are used to develop distribution models to predict the vegetation
type occurrence of 11 groundwater-dependent vegetation types in Belgian
lowland valley ecosystems based on spatially distributed measurements
of environmental conditions. The spatially distributed data set under
investigation consists of 1705 grid cells covering an area of 47.32
ha. After model construction and calibration, both models are applied
to independent test data sets using two-fold cross-validation and
resulting probabilities of occurrence are used to predict vegetation
type distributions within the study area. Predicted vegetation types
are compared with observations, and the McNemar test indicates an
overall better performance of the random forest model at the 0.001
significance level. Comparison of the modelling results for each
individual vegetation type separately by means of the F-measure,
which combines precision and recall, also reveals better predictions
by the random forest model. Inspection of the probabilities of occurrence
of the different vegetation types for each grid cell demonstrates
that correct predictions in central areas of homogeneous vegetation
sites are based on high probabilities, whereas the confidence decreases
towards the margins of these areas. Threshold-independent evaluation
of the model accuracy by means of the area under the receiver operating
characteristic (ROC) curves confirms good performances of both models,
but with higher values for the random forest model. Therefore, the
incorporation of the random forest technique in distribution models
has the ability to lead to better model performances.},
  added-at  = {2010-07-07T17:27:19.000+0200},
  author    = {Peters, Jan and {De Baets}, Bernard and Verhoest, Niko E. C. and Samson, Roeland and Degroeve, Sven and {De Becker}, Piet and Huybrechts, Willy},
  biburl    = {https://www.bibsonomy.org/bibtex/28b098e9570494c92a5bc81a109ecac85/pillo},
  interhash = {a266c54c45f03039c9f5c19c582521e1},
  intrahash = {8b098e9570494c92a5bc81a109ecac85},
  journal   = {Ecological Modelling},
  keywords  = {imported},
  owner     = {Bernd Panassiti},
  pages     = {304--318},
  timestamp = {2010-07-07T17:27:23.000+0200},
  title     = {Random forests as a tool for ecohydrological distribution modelling},
  volume    = {207},
  year      = {2007},
}