[r-cran-ipred] 01/02: New upstream version 0.9-6
Andreas Tille
tille at debian.org
Sun Oct 22 19:32:15 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-cran-ipred.
commit 1ff96838c6626115c783fbe2716e97be68df59c9
Author: Andreas Tille <tille at debian.org>
Date: Sun Oct 22 21:31:26 2017 +0200
New upstream version 0.9-6
---
DESCRIPTION | 27 +
MD5 | 80 +++
NAMESPACE | 88 +++
R/bagging.R | 33 ++
R/bootest.R | 267 +++++++++
R/checkfunArgs.R | 33 ++
R/csurv.R | 71 +++
R/cv.R | 263 ++++++++
R/errorest.R | 155 +++++
R/inbagg.R | 241 ++++++++
R/inclass.R | 190 ++++++
R/ipredbagg.R | 266 +++++++++
R/ipredknn.R | 69 +++
R/irpart.R | 57 ++
R/kfoldcv.R | 21 +
R/mypredict.lm.R | 32 +
R/predict.bagging.R | 267 +++++++++
R/predict.inbagg.R | 30 +
R/predict.inclass.R | 37 ++
R/predict.irpart.R | 54 ++
R/print.R | 218 +++++++
R/prune.bagging.R | 23 +
R/rsurv.R | 56 ++
R/sbrier.R | 151 +++++
R/slda.R | 112 ++++
R/ssubset.R | 39 ++
R/varset.R | 26 +
build/vignette.rds | Bin 0 -> 236 bytes
cleanup | 29 +
data/DLBCL.rda | Bin 0 -> 4067 bytes
data/GlaucomaMVF.rda | Bin 0 -> 19144 bytes
data/Smoking.rda | Bin 0 -> 1314 bytes
data/dystrophy.rda | Bin 0 -> 3173 bytes
inst/COPYRIGHTS | 26 +
inst/NEWS | 285 +++++++++
inst/doc/ipred-examples.R | 144 +++++
inst/doc/ipred-examples.Rnw | 401 +++++++++++++
inst/doc/ipred-examples.pdf | Bin 0 -> 81843 bytes
man/DLBCL.Rd | 57 ++
man/GlaucomaMVF.Rd | 134 +++++
man/Smoking.Rd | 38 ++
man/bagging.Rd | 238 ++++++++
man/bootest.Rd | 48 ++
man/control.errorest.Rd | 33 ++
man/cv.Rd | 46 ++
man/dystrophy.Rd | 59 ++
man/errorest.Rd | 257 ++++++++
man/inbagg.Rd | 112 ++++
man/inclass.Rd | 116 ++++
man/ipred-internal.Rd | 13 +
man/ipredknn.Rd | 48 ++
man/kfoldcv.Rd | 38 ++
man/mypredict.lm.Rd | 28 +
man/predict.bagging.Rd | 85 +++
man/predict.inbagg.Rd | 57 ++
man/predict.inclass.Rd | 119 ++++
man/predict.ipredknn.Rd | 29 +
man/predict.slda.Rd | 29 +
man/print.bagging.Rd | 21 +
man/print.cvclass.Rd | 24 +
man/print.inbagg.Rd | 18 +
man/print.inclass.Rd | 18 +
man/prune.bagging.Rd | 38 ++
man/rsurv.Rd | 55 ++
man/sbrier.Rd | 183 ++++++
man/slda.Rd | 96 +++
man/summary.bagging.Rd | 23 +
man/summary.inbagg.Rd | 30 +
man/summary.inclass.Rd | 31 +
man/varset.Rd | 58 ++
src/SdiffKM.c | 94 +++
src/init.c | 23 +
tests/Examples/ipred-Ex.Rout.save | 1188 +++++++++++++++++++++++++++++++++++++
tests/ipred-bugs.R | 51 ++
tests/ipred-bugs.Rout.save | 185 ++++++
tests/ipred-segfault.R | 136 +++++
tests/ipred-segfault.Rout.save | 447 ++++++++++++++
tests/ipred-smalltest.R | 34 ++
tests/ipred-smalltest.Rout.save | 55 ++
vignettes/ipred-examples.Rnw | 401 +++++++++++++
vignettes/ipred.bib | 73 +++
81 files changed, 8657 insertions(+)
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..bf3830e
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,27 @@
+Package: ipred
+Title: Improved Predictors
+Version: 0.9-6
+Date: 2017-03-01
+Authors at R: c(person("Andrea", "Peters", role = "aut"),
+ person("Torsten", "Hothorn", role = c("aut", "cre"),
+ email = "Torsten.Hothorn at R-project.org"),
+ person("Brian D.", "Ripley", role = "ctb"),
+ person("Terry", "Therneau", role = "ctb"),
+ person("Beth", "Atkinson", role = "ctb"))
+Description: Improved predictive models by indirect classification and
+ bagging for classification, regression and survival problems
+ as well as resampling based estimators of prediction error.
+Depends: R (>= 2.10)
+Imports: rpart (>= 3.1-8), MASS, survival, nnet, class, prodlim
+Suggests: mvtnorm, mlbench, TH.data
+License: GPL (>= 2)
+NeedsCompilation: yes
+Packaged: 2017-03-01 14:07:15 UTC; hothorn
+Author: Andrea Peters [aut],
+ Torsten Hothorn [aut, cre],
+ Brian D. Ripley [ctb],
+ Terry Therneau [ctb],
+ Beth Atkinson [ctb]
+Maintainer: Torsten Hothorn <Torsten.Hothorn at R-project.org>
+Repository: CRAN
+Date/Publication: 2017-03-01 19:05:51
diff --git a/MD5 b/MD5
new file mode 100644
index 0000000..b51af24
--- /dev/null
+++ b/MD5
@@ -0,0 +1,80 @@
+5bdc3b92276c5006cfe8644081d53361 *DESCRIPTION
+15b9a39f2d45f6a431b66ad752f2be5a *NAMESPACE
+bd957f8c9597e280c5bc486db1b50fed *R/bagging.R
+c55f9e65194e69cac7b3f6a14819f7ee *R/bootest.R
+d8b949efde7413d8d639a64be6528403 *R/checkfunArgs.R
+85963ba4287c3ffd4c84dbe8a48d9c28 *R/csurv.R
+16fcd24ff394cdec7d929f8acacf5823 *R/cv.R
+b17e241d115e8e9491c5e07e612295fc *R/errorest.R
+53ec8b284293a7942d4b31f27c11eb6e *R/inbagg.R
+9bc7ff831e43ae443c3afd0015f9b45e *R/inclass.R
+96677befab2b4535bc23d755ad5f6c92 *R/ipredbagg.R
+d22859a559e876b2e478d366b705d0a1 *R/ipredknn.R
+a98cc646bc7f7a9173d511917bc2db34 *R/irpart.R
+6100af39d915430ab680a3889d52d4ec *R/kfoldcv.R
+7d64c8043f1426fe3f41e324dad74be6 *R/mypredict.lm.R
+4abb96da501e39909f71569ba8b1b85b *R/predict.bagging.R
+1714695c8d4dd8439fc6d1e3f7968e1c *R/predict.inbagg.R
+301f8e08bee09b9817e8ffdce0a4c398 *R/predict.inclass.R
+8b4b9b3039c4fb86d2c96555b32872a8 *R/predict.irpart.R
+5427f43c2b21d2e7f3c6ff2fc521c33a *R/print.R
+7fff7c589cfae0b7e57f6af2bf6974f4 *R/prune.bagging.R
+ea8bb0575e5ee83b6a4d595212906f83 *R/rsurv.R
+87f1047a58009264131f1a5928726b21 *R/sbrier.R
+bb92fccd557c101dc3f52a3c7a7a1961 *R/slda.R
+51c9a057a29e6be4477aaf23bcce4000 *R/ssubset.R
+860d6ab9df7627b349a2fd9ac71237fe *R/varset.R
+805d97840550912b17fa3d6e6484ee17 *build/vignette.rds
+f83cb6bdc4e6265a64be0914da7979f6 *cleanup
+b77f49ce74f90948e4d09122c5fac724 *data/DLBCL.rda
+b07616370b51419752d4219f1f4f9f55 *data/GlaucomaMVF.rda
+1f87b4f0d914888b1be71028fef8d316 *data/Smoking.rda
+e54b730797d71b1483cc26bfb3ea552b *data/dystrophy.rda
+45a8a599f130fd94e2bf0ccea723a290 *inst/COPYRIGHTS
+3bca94074b3840c2411c6c49d0f95c66 *inst/NEWS
+82eeec327b400aadc3c7fe0905483d8a *inst/doc/ipred-examples.R
+72c8610d330afc2376692437ffd0b5e0 *inst/doc/ipred-examples.Rnw
+441f1fbdc216c684a08449eeb56ff18b *inst/doc/ipred-examples.pdf
+90287c34e610475e251c8b01241ca1ad *man/DLBCL.Rd
+3cf72f2749b7029a0b8b685461d99d3c *man/GlaucomaMVF.Rd
+c7a9f9b81fb440185fccab54c289864e *man/Smoking.Rd
+4fc61476e445db2e8930b95469aaafc0 *man/bagging.Rd
+f4694ae7448c30a49814bc23fee777a7 *man/bootest.Rd
+920bf08095b01ae4d943fbff21eedb57 *man/control.errorest.Rd
+b9874254d1a440ce7d1373ddb88ed18b *man/cv.Rd
+54ef878e658ab5d41e3a54642ca9a451 *man/dystrophy.Rd
+ebb22088953b88f1a62151d295fc2ebd *man/errorest.Rd
+ca29c817b98efbef404f5b8467c796c3 *man/inbagg.Rd
+ceeaae7b39770e87036bd86c4b899a92 *man/inclass.Rd
+d34608ac849321130774fc768f28c475 *man/ipred-internal.Rd
+86ea0f01231c482541c1ae86fa6f1652 *man/ipredknn.Rd
+37eee681bff7ad10df28bb18eab6cf2e *man/kfoldcv.Rd
+941c6c8d91a74ae8a046811346664b0b *man/mypredict.lm.Rd
+d1705bf690f5e92abf7123ca0e0ad2b7 *man/predict.bagging.Rd
+b3dbc86c0c9cab4b549b162b75248c31 *man/predict.inbagg.Rd
+b80ad2198613b405333aef1e6e3cc504 *man/predict.inclass.Rd
+0da9ab0fcef3c03dc6f04309c8830b83 *man/predict.ipredknn.Rd
+d45eeb09998f42d52f180a46296b9744 *man/predict.slda.Rd
+d02437884a49f5b9937791ed6d07c53b *man/print.bagging.Rd
+755e22b8d9799ff3d55f7364be332931 *man/print.cvclass.Rd
+b1ae5acecd41d145898b794567a38231 *man/print.inbagg.Rd
+0e78f227c282e6474a5b504a74b97fe2 *man/print.inclass.Rd
+8903ad93aa5f9f092faab0db112733bd *man/prune.bagging.Rd
+d88328ca9e52b501e01d1b291de8f16d *man/rsurv.Rd
+f617299850199477617f8c80f9967fae *man/sbrier.Rd
+ac34c8ccf9d10e1e15e7a2138c14b2cb *man/slda.Rd
+9000c36afc63d72103df0e5c41dfffc5 *man/summary.bagging.Rd
+dd36ca065d305401e0326680b5cda910 *man/summary.inbagg.Rd
+8ce9a1f1379d492df5aea55687b6b95c *man/summary.inclass.Rd
+79fed002bac6fba4b49458b352873e8c *man/varset.Rd
+0ac59b9f3655966e0fb52ca8a8b2b27a *src/SdiffKM.c
+4c8242f0f0243ec116d9d1dd8ed99150 *src/init.c
+6fd69d16ba74463298302132085589d5 *tests/Examples/ipred-Ex.Rout.save
+829483c0e5b37b6bae902fd8cb625b7c *tests/ipred-bugs.R
+2d426b88005c657aa4746d841e3d369e *tests/ipred-bugs.Rout.save
+fbbdb49957b86b7f24491f38e341dace *tests/ipred-segfault.R
+a09567709b1965ea2a2bb18942e15fcf *tests/ipred-segfault.Rout.save
+7d770b8d05d185fd791cadccce2a6795 *tests/ipred-smalltest.R
+764f1025948d850298686b2f4f5277eb *tests/ipred-smalltest.Rout.save
+72c8610d330afc2376692437ffd0b5e0 *vignettes/ipred-examples.Rnw
+c642c366927d8bf2522e3c078f6e34a2 *vignettes/ipred.bib
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..486c88c
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,88 @@
+useDynLib(ipred, .registration = TRUE)
+
+### exported functions
+export(
+getsurv,
+bagging,
+ipredbagg,
+control.errorest,
+errorest,
+bootest,
+cv,
+inbagg,
+inclass,
+ipredknn,
+predict.ipredknn,
+kfoldcv, mypredict.lm, rsurv, sbrier, varset,
+slda)
+
+importFrom("stats", "as.formula", "complete.cases", "cov",
+ "delete.response", "model.extract", "model.frame",
+ "model.matrix", "na.omit", "naresid", "predict", "rexp",
+ "rnorm", "runif", "terms")
+importFrom("utils", "data", "getFromNamespace")
+
+### register S3 methods
+#S3method(print, bagging)
+S3method(slda, default)
+S3method(slda, formula)
+S3method(slda, factor)
+S3method(bagging, default)
+S3method(bagging, data.frame)
+S3method(errorest, default)
+S3method(errorest, data.frame)
+S3method(bootest, default)
+S3method(bootest, integer)
+S3method(bootest, factor)
+S3method(bootest, numeric)
+S3method(bootest, Surv)
+S3method(inbagg, default)
+S3method(inbagg, data.frame)
+S3method(inclass, default)
+S3method(inclass, data.frame)
+S3method(cv, default)
+S3method(cv, integer)
+S3method(cv, factor)
+S3method(cv, numeric)
+S3method(cv, Surv)
+S3method(ipredbagg, Surv)
+S3method(ipredbagg, factor)
+S3method(ipredbagg, numeric)
+S3method(prune, classbagg)
+S3method(prune, survbagg)
+S3method(prune, classbagg)
+S3method(print, classbagg)
+S3method(print, regbagg)
+S3method(print, survbagg)
+S3method(print, classbagg)
+S3method(summary, classbagg)
+S3method(summary, survbagg)
+S3method(summary, classbagg)
+S3method(print, cvclass)
+S3method(print, bootestclass)
+S3method(print, cvreg)
+S3method(print, bootestreg)
+S3method(print, cvsurv)
+S3method(print, bootestsurv)
+S3method(print, inbagg)
+S3method(summary, inbagg)
+S3method(print, inclass)
+S3method(summary, inclass)
+S3method(predict, inbagg)
+S3method(predict, inclass)
+S3method(predict, classbagg)
+S3method(predict, regbagg)
+S3method(predict, survbagg)
+S3method(predict, slda)
+S3method(predict, ipredknn)
+#S3method(predict, sclass)
+#S3method(predict, sreg)
+
+
+##imported functions
+importFrom(class, knn)
+importFrom(rpart, rpart, prune, na.rpart, rpart.control)
+importFrom(MASS, lda)
+importFrom(survival, Surv, is.Surv, survfit)
+importFrom(nnet, multinom)
+importFrom(prodlim, prodlim)
diff --git a/R/bagging.R b/R/bagging.R
new file mode 100644
index 0000000..81250c2
--- /dev/null
+++ b/R/bagging.R
@@ -0,0 +1,33 @@
+# $Id: bagging.R,v 1.19 2005/06/29 08:50:28 hothorn Exp $
+
+bagging <- function(formula, data, ...) UseMethod("bagging", data)
+
+bagging.default <- function(formula, data, ...)
+ stop(paste("Do not know how to handle objects of class", class(data)))
+
+bagging.data.frame <-
+function(formula, data, subset, na.action=na.rpart, ...)
+{
+ cl <- match.call()
+ if(missing(formula)
+ || (length(formula) != 3)
+ || (length(attr(terms(formula[-2], data = data), "term.labels")) < 1)
+ || (length(attr(terms(formula[-3], data = data), "term.labels")) != 1))
+ stop("formula missing or incorrect")
+ m <- match.call(expand.dots = FALSE)
+ if(is.matrix(eval(m$data, parent.frame())))
+ m$data <- as.data.frame(data)
+ m[[1]] <- as.name("model.frame")
+ m$... <- NULL
+ mf <- eval(m, parent.frame())
+ response <- attr(attr(mf, "terms"), "response")
+ # just extract the data.frame, no handling of contrasts or NA's here.
+ # this is done by rpart or the user supplied methods
+
+ DATA <- list(y = mf[,response], X = mf[,-response, drop = FALSE])
+ names(DATA) <- c("y", "X")
+ y <- do.call("ipredbagg", c(DATA, list(...)))
+ y$call <- cl
+ return(y)
+}
+
diff --git a/R/bootest.R b/R/bootest.R
new file mode 100644
index 0000000..5b33d52
--- /dev/null
+++ b/R/bootest.R
@@ -0,0 +1,267 @@
+# $Id: bootest.R,v 1.18 2004/02/09 08:08:21 peters Exp $
+
+bootest <- function(y, ...) {
+ if(is.null(class(y)))
+ class(y) <- data.class(y)
+ UseMethod("bootest", y)
+}
+
+bootest.default <- function(y, ...) {
+ stop(paste("Do not know how to handle objects of class", class(y)))
+}
+
+bootest.integer <- function(y, ...) {
+ bootest.numeric(y, ...)
+}
+
+bootest.factor <- function(y, formula, data, model, predict,
+ nboot=25, bc632plus = FALSE, list.tindx = NULL, predictions = FALSE, both.boot = FALSE, ...) {
+
+ # bootstrap estimator of misclassification error
+
+ N <- length(y)
+ nindx <- 1:N
+ if(!is.null(list.tindx)) nboot <- length(list.tindx)
+
+ bootindx <- matrix(NA, ncol=nboot, nrow=N)
+ if(predictions) {
+ BOOTINDX <- data.frame(matrix(NA, ncol=nboot, nrow=N))
+ }
+
+ classes <- levels(y)
+ USEPM <- FALSE
+
+ if(!is.data.frame(data)) stop("data is not a data.frame")
+ if(nboot <=2) stop("to small number of bootstrap replications")
+ if(is.null(nboot)) stop("number of bootstrap replications is missing")
+ if(!is.null(list.tindx) & length(list.tindx) != nboot) stop(paste("List of selected observations per bootstrap sample has to be", nboot))
+
+ for(i in 1:nboot) {
+ if(!is.null(list.tindx)) {
+ tindx <- list.tindx[[i]]
+ if(length(tindx) > N) warning("number of selected observations is larger than the sample size")
+ } else {
+ tindx <- sample(nindx, N, replace = TRUE)
+ }
+
+ mymodel <- model(formula, data = data[tindx,], ...)
+
+ # check if mymodel is a function which should be used instead of
+ # predict
+ if (is.function(mymodel)) {
+ if(!is.null(predict) & i == 1)
+ warning("model returns a function and predict is specified, using models output")
+ predict <- mymodel
+ USEPM <- TRUE
+ }
+
+ if (USEPM)
+ pred <- predict(newdata=data)
+ else
+ pred <- predict(mymodel, newdata = data)
+ if (!is.factor(pred)) stop("predict does not return factor values")
+ pred <- factor(pred, levels=classes)[-tindx]
+ if (length(pred) != length(y[-tindx]))
+ stop("different length of data and prediction")
+ if(predictions) {
+ BOOTINDX[,i] <- factor(BOOTINDX[,i],levels = classes)
+ BOOTINDX[-tindx, i] <- pred
+ }
+ bootindx[-tindx, i] <- (pred != y[-tindx])
+ }
+
+ fun <- function(x)
+ ifelse(all(is.na(x)), NA, mean(as.integer(x), na.rm = TRUE))
+
+ one <- mean(apply(bootindx, 1, fun), na.rm = TRUE)
+
+ if (bc632plus) {
+ full.model <- model(formula, data = data, ...)
+ # check if full.model is a function which should be used instead of
+ # predict
+ if (is.function(full.model)) {
+ predict <- full.model
+ USEPM <- TRUE
+ }
+
+ if (USEPM)
+ full.pred <- predict(newdata=data)
+ else
+
+ full.pred <- predict(full.model, newdata = data)
+ resubst <- mean(full.pred != y, na.rm = TRUE)
+
+ err632 <- 0.368*resubst + 0.632*one
+
+ y <- y[!is.na(y) & !is.na(full.pred)]
+ full.pred <- full.pred[!is.na(y) & !is.na(full.pred)]
+ gamma <- sum(outer(y, full.pred, function(x, y) ifelse(x==y, 0, 1) ))/
+ (length(y)^2)
+ r <- (one - resubst)/(gamma - resubst)
+ r <- ifelse(one > resubst & gamma > resubst, r, 0)
+ errprime <- min(one, gamma)
+ # weight <- .632/(1-.368*r)
+ # err <- (1-weight)*resubst + weight*one
+ err <- err632 + (errprime - resubst)*(0.368*0.632*r)/(1-0.368*r)
+ if(predictions)
+ RET <- list(error = err, nboot = nboot, bc632plus = TRUE, predictions = BOOTINDX)
+ else
+ RET <- list(error = err, nboot=nboot, bc632plus = TRUE)
+ if(both.boot){
+ bc632plus <- FALSE
+ RETbc <- RET
+ }
+ }
+
+ if(!bc632plus) {
+ err <- one
+ expb <- rep(0, nboot)
+ for(i in 1:nboot)
+ expb[i] <- mean(apply(bootindx[,-i], 1, fun), na.rm = TRUE)
+
+ sdint <- sqrt( ((nboot - 1)/nboot)*sum((expb - mean(expb))^2) )
+ if(predictions)
+ RET <- list(error = err, sd = sdint, bc632plus = FALSE, nboot = nboot, predictions = BOOTINDX)
+ else
+ RET <- list(error = err, sd=sdint, bc632plus=FALSE, nboot=nboot)
+ if(both.boot){
+ RET <- list("boot" = RET, "632plus" = RETbc)
+ }
+ }
+ class(RET) <- "bootestclass"
+ RET
+}
+
+bootest.numeric <- function(y, formula, data, model, predict,
+ nboot=25, bc632plus=FALSE, list.tindx = NULL, predictions = FALSE, ...) {
+
+ # bootstrap estimator of root of mean squared error
+
+ if (bc632plus) stop("cannot compute 632+ estimator of mean squared error")
+ if(!is.null(list.tindx)) nboot <- length(list.tindx)
+ if (nboot <=2) stop("to small number of bootstrap replications")
+
+ ##FIX: nrow =
+ N <- length(y)
+ nindx <- 1:N
+
+ bootindx <- matrix(NA, ncol=nboot, nrow=N)
+ if(predictions) BOOTINDX <- matrix(NA, ncol=nboot, nrow=N)
+ USEPM <- FALSE
+
+ if (!is.data.frame(data)) stop("data is not a data.frame")
+ if(is.null(nboot)) stop("number of bootstrap replications is missing")
+ if(!is.null(list.tindx) & length(list.tindx) != nboot) stop(paste("List of selected observations per bootstrap sample has to be", nboot))
+
+
+ for(i in 1:nboot) {
+ if(!is.null(list.tindx)) {
+ tindx <- list.tindx[[i]]
+ if(length(tindx) > N) warning("number of selected observations is larger than the sample size")
+ } else {
+ tindx <- sample(nindx, N, replace = TRUE)
+ }
+# tindx <- ifelse(!is.null(list.tindx), list.tindx[[i]], sample(nindx, N, replace = TRUE))
+ mymodel <- model(formula, data = data[tindx,], ...)
+ outbootdata <- subset(data, !(nindx %in% tindx))
+ # check if mymodel is a function which should be used instead of
+ # predict
+ if (is.function(mymodel)) {
+ if(!is.null(predict) & i == 1)
+ warning("model returns a function and predict is specified, using models output")
+ predict <- mymodel
+ USEPM <- TRUE
+ }
+
+ if (USEPM)
+ pred <- predict(newdata=outbootdata)
+ else
+ pred <- predict(mymodel, newdata = outbootdata)
+ if (!is.numeric(pred)) stop("predict does not return numerical values")
+ if (length(pred) != length(y[-tindx]))
+ stop("different length of data and prediction")
+ if(predictions) BOOTINDX[-tindx, i] <- pred
+ bootindx[-tindx, i] <- (pred - y[-tindx])^2
+ }
+
+ fun <- function(x)
+ ifelse(all(is.na(x)), NA, mean(x, na.rm = TRUE))
+
+ err <- sqrt(mean(apply(bootindx, 1, fun), na.rm = TRUE))
+ if(predictions)
+ RET <- list(error = err, nboot = nboot, predictions = BOOTINDX)
+ else
+ RET <- list(error = err, nboot=nboot)
+ class(RET) <- "bootestreg"
+ RET
+}
+
+bootest.Surv <- function(y, formula, data=NULL, model, predict,
+ nboot=25, bc632plus=FALSE, list.tindx = NULL, predictions = FALSE, ...) {
+
+ # bootstrap estimator of Brier's score
+
+ if (bc632plus) stop("cannot compute 632+ estimator of Brier's score")
+
+ N <- dim(y)[1]
+ if(!is.null(list.tindx)) nboot <- length(list.tindx)
+ nindx <- 1:N
+
+ bootindx <- matrix(NA, ncol=nboot, nrow=N)
+ if(predictions) BOOTINDX <- matrix(NA, ncol=nboot, nrow=N)
+ USEPM <- FALSE
+
+ if(is.null(nboot)) stop("number of bootstrap replications is missing")
+ if (nboot <=2) stop("to small number of bootstrap replications")
+ if (is.null(data)) data <- as.data.frame(rep(1, N))
+ if (!is.data.frame(data)) stop("data is not a data.frame")
+ if(!is.null(list.tindx)) nboot <- length(list.tindx)
+
+
+ for(i in 1:nboot) {
+ if(!is.null(list.tindx)) {
+ tindx <- list.tindx[[i]]
+ if(tindx > N) warning("number of selected observations is larger than the sample size")
+ } else {
+ tindx <- sample(nindx, N, replace = TRUE)
+ }
+ #tindx <- ifelse(!is.null(list.tindx), list.tindx[[i]], sample(nindx, N, replace = TRUE))
+ #tindx <- sample(nindx, N, replace = TRUE)
+ mymodel <- model(formula, data=data[tindx,], ...)
+ outbootdata <- subset(data, !(nindx %in% tindx))
+ # check if mymodel is a function which should be used instead of
+ # predict
+ if (is.function(mymodel)) {
+ if(!is.null(predict) & i == 1)
+ warning("model returns a function and predict is specified, using models output")
+ predict <- mymodel
+ USEPM <- TRUE
+ }
+
+ if (USEPM)
+ pred <- predict(newdata=outbootdata)
+ else
+ pred <- predict(mymodel, newdata = outbootdata)
+
+ if (is.list(pred)) {
+ if (!inherits(pred[[1]], "survfit") && !inherits(pred, "survfit"))
+ stop("predict does not return a list of survfit objects")
+ } else {
+ stop("predict does not return a list of survfit objects")
+ }
+ if(predictions) BOOTINDX[-tindx, i] <- sbrier(y[-tindx], pred) ###???
+ bootindx[-tindx, i] <- sbrier(y[-tindx], pred)
+ }
+
+ fun <- function(x)
+ ifelse(all(is.na(x)), NA, mean(x, na.rm = TRUE))
+
+ err <- mean(apply(bootindx, 1, fun), na.rm = TRUE)
+ if(predictions)
+ RET <- list(error = err, nboot = nboot, predictions = BOOTINDX)
+ else
+ RET <- list(error = err, nboot=nboot)
+ class(RET) <- "bootestsurv"
+ RET
+}
+
diff --git a/R/checkfunArgs.R b/R/checkfunArgs.R
new file mode 100644
index 0000000..bf1647a
--- /dev/null
+++ b/R/checkfunArgs.R
@@ -0,0 +1,33 @@
+# $Id: checkfunArgs.R,v 1.1 2003/02/17 09:49:31 hothorn Exp $
+
+checkfunArgs <- function(fun, type=c("model", "predict")) {
+
+ # check for appropriate arguments of user-supplied function "fun"
+ # this will not work for generics in R < 1.7.0 and therefore not used by
+ # now
+
+ type <- match.arg(type)
+
+ if (!is.function(fun)) {
+ warning("fun is not a function")
+ return(FALSE)
+ }
+
+ funargs <- formals(fun)
+
+ switch(type, "model"={
+ if (!all(names(funargs)[1:2] %in% c("formula", "data"))) {
+ warning("fun is not a function with at least 'formula' and 'data' arguments")
+ return(FALSE)
+ } else {
+ return(TRUE)
+ }
+ }, "predict"={
+ if (length(funargs) < 2) {
+ warnings("fun is not a function with at least 'object' and 'newdata' arguments")
+ return(FALSE)
+ } else {
+ return(TRUE)
+ }
+ })
+}
diff --git a/R/csurv.R b/R/csurv.R
new file mode 100644
index 0000000..159f814
--- /dev/null
+++ b/R/csurv.R
@@ -0,0 +1,71 @@
+# $Id: csurv.R,v 1.6 2003/03/28 12:55:32 hothorn Exp $
+
+csurv <- function(newdata, pred, minprob=0, window=0.0001) {
+
+ N <- nrow(newdata)
+ if (!"hazard" %in% names(attributes(newdata)))
+ stop("hazards attribute to newdata missing")
+ hazards <- attr(newdata, "hazard")
+
+ error <- rep(0, N)
+
+ # if there is only one prediction for all observations
+ GETPROB <- TRUE
+ if (inherits(pred, "survfit")) {
+ times <- pred$time # get times
+ predprob <- getsurv(pred, times) # get steps
+ GETPROB <- FALSE
+ }
+
+ for (i in 1:N) {
+ if (GETPROB) {
+ times <- pred[[i]]$time # get times
+ predprob <- getsurv(pred[[i]], times) # get steps
+ }
+ # compute the integrated squared difference between
+ # KM and S(t)
+ # minprob: stop integration when S(t) < minprob
+ lasttime <- -(log(minprob) / hazards[i])
+ if (max(times) > lasttime) {
+ thisprob <- predprob[times <= lasttime]
+ thistimes <- times[times <= lasttime]
+ } else {
+ thisprob <- predprob
+ thistimes <- times
+ }
+ error[i] <- .Call(SdiffKM, as.double(c(0,thistimes)),
+ as.double(c(1,thisprob)),
+ as.double(c(hazards[i], window)))
+ # adjust for time scale by last event
+ error[i] <- error[i]/max(thistimes)
+ if (length(unique(hazards)) == 1) {
+ error <- error[i]
+ break
+ }
+ }
+ error <- mean(error)
+ error
+}
+
+foo <- function (time, prob, hazard, window)
+{
+ myint <- 0
+ time <- c(0, time)
+ s <- exp(-time * hazard)
+ prob <- c(1, prob)
+ for (i in 1:(length(time)-1)) {
+ d <- time[i+1] - time[i]
+ if (d < window) {
+ myint <- myint + 0.5 * d * ((prob[i] - s[i])^2 +
+ (prob[i] - s[i + 1])^2)
+ }
+ else {
+ k <- ceiling(d/window)
+ wi <- d/k
+ for (j in 1:k) myint <- myint + 0.5 * wi * ((prob[i] -
+ exp(-(time[i] + (j - 1) * wi) * hazard))^2 +
+ (prob[i] - exp(-(time[i] + j * wi) * hazard))^2)
+ }
+ }
+ myint
+}
diff --git a/R/cv.R b/R/cv.R
new file mode 100644
index 0000000..201a8c7
--- /dev/null
+++ b/R/cv.R
@@ -0,0 +1,263 @@
+#$Id: cv.R,v 1.21 2004/02/11 09:13:51 peters Exp $
+
+cv <- function(y, ...) {
+ if(is.null(class(y)))
+ class(y) <- data.class(y)
+ UseMethod("cv", y)
+}
+
+cv.default <- function(y, ...) {
+ stop(paste("Do not know how to handle objects of class", class(y)))
+}
+
+cv.integer <- function(y, ...) {
+ cv.numeric(y, ...)
+}
+
+cv.factor <- function(y, formula, data, model, predict, k=10, random=TRUE,
+ strat=FALSE, predictions=NULL, getmodels=NULL, list.tindx = NULL, ...) {
+
+ # k-fold cross-validation of misclassification error
+
+ if (!is.data.frame(data)) stop("data is not of class data.frame")
+
+ N <- length(y)
+ classes <- levels(y)
+
+ if (is.null(k)) k <- 10
+ if (is.null(random)) random <- TRUE
+ if (is.null(strat)) strat <- FALSE
+ if (is.null(predictions)) predictions <- FALSE
+ if (is.null(getmodels)) getmodels <- FALSE
+ USEPM <- FALSE
+
+ if(!is.null(list.tindx)) k <- length(list.tindx)
+ if(!is.null(list.tindx)) {
+ random <- FALSE
+ }
+
+ # to reproduce results, either use `set.seed' or a fixed partition of
+ # the samples
+ if (random)
+ myindx <- sample(1:N, N)
+ else
+ myindx <- 1:N
+
+ y <- y[myindx]
+ data <- data[myindx,]
+
+ # determine an appropriate splitting for the sample size into
+ # k roughly equally sized parts
+
+ mysplit <- ssubset(y, k, strat=strat)
+ allpred <- vector(mode="character", length=N)
+ fu <- function(x) levels(x)[as.integer(x)]
+ nindx <- 1:N
+
+ if (getmodels)
+ models <- vector(k, mode="list")
+
+ for(i in 1:k) {
+
+ if(!is.null(list.tindx)) {
+ tindx <- list.tindx[[i]]
+ } else {
+ tindx <- mysplit[[i]]
+ }
+
+ folddata <- subset(data, !(nindx %in% tindx))
+ mymodel <- model(formula, data=folddata, ...)
+ if (getmodels) models[[i]] <- mymodel
+
+ # check of mymodel is a function which should be used instead of
+ # predict
+ if (is.function(mymodel)) {
+ if(!is.null(predict) & i == 1)
+ warning("model returns a function and predict is specified, using models output")
+ predict <- mymodel
+ USEPM <- TRUE
+ }
+
+ # we assume predict to return factor levels
+ if (USEPM)
+ pred <- predict(newdata=data)
+ else
+ pred <- predict(mymodel, newdata = data)
+ if (!is.factor(pred)) stop("predict does not return factor values")
+ pred <- factor(pred, levels=classes)
+
+ # <FIXME>
+ # there is no c() for factors which preserves the levels, isn't it?
+ # use characters
+ allpred[tindx] <- fu(pred[tindx])
+ # </FIXME>
+ }
+ allpred <- factor(allpred, levels=classes)
+ allpred <- allpred[order(myindx)]
+ err <- mean(allpred != y[order(myindx)], na.rm = TRUE)
+ if (predictions)
+ RET <- list(error = err, k = k, predictions=allpred)
+ else
+ RET <- list(error = err, k = k)
+ if (getmodels)
+ RET <- c(RET, models=list(models))
+ class(RET) <- "cvclass"
+ RET
+}
+
+cv.numeric <- function(y, formula, data, model, predict, k=10, random=TRUE,
+ predictions=NULL, strat=NULL, getmodels=NULL, list.tindx = NULL, ...) {
+
+ # k-fold cross-validation of mean squared error
+
+ if (!is.data.frame(data)) stop("data is not of class data.frame")
+ if(!is.null(list.tindx)) k <- length(list.tindx)
+ N <- length(y)
+
+ if (is.null(k)) k <- 10
+ if (is.null(random)) random <- TRUE
+ if (is.null(predictions)) predictions <- FALSE
+ if (is.null(getmodels)) getmodels <- FALSE
+ USEPM <- FALSE
+ # determine an appropriate splitting for the sample size into
+ # k roughly equally sized parts
+
+# if(is.null(list.tindx)) {
+ a <- kfoldcv(k, N)
+ # to reproduce results, either use `set.seed' or a fixed partition of
+ # the samples
+ if (random)
+ myindx <- sample(1:N, N)
+ else
+ myindx <- 1:N
+ nindx <- 1:N
+# }
+
+ if (getmodels)
+ models <- vector(k, mode="list")
+
+ allpred <- rep(0, N)
+ for(i in 1:k) {
+ if(!is.null(list.tindx)) {
+ tindx <- list.tindx[[i]]
+ } else {
+ if (i > 1)
+ tindx <- myindx[(sum(a[1:(i-1)])+1):sum(a[1:i])]
+ else
+ tindx <- myindx[1:a[1]]
+ }
+ folddata <- subset(data, !(nindx %in% tindx))
+ mymodel <- model(formula, data=folddata, ...)
+ if (getmodels) models[[i]] <- mymodel
+
+ # check of mymodel is a function which should be used instead of
+ # predict
+ if (is.function(mymodel)) {
+ if(!is.null(predict) & i == 1)
+ warning("model returns a function and predict is specified, using models output")
+ predict <- mymodel
+ USEPM <- TRUE
+ }
+
+ outfolddata <- subset(data, nindx %in% tindx)
+ if (USEPM)
+ pred <- predict(newdata=outfolddata)
+ else
+ pred <- predict(mymodel, newdata = outfolddata)
+ if (!is.numeric(pred)) stop("predict does not return numerical values")
+ allpred[sort(tindx)] <- pred
+ }
+ err <- sqrt(mean((allpred - y)^2, na.rm = TRUE))
+ if (predictions)
+ RET <- list(error = err, k = k, predictions=allpred)
+ else
+ RET <- list(error = err, k = k)
+ if (getmodels)
+ RET <- c(RET, models=list(models))
+ class(RET) <- "cvreg"
+ RET
+}
+
+cv.Surv <- function(y, formula, data=NULL, model, predict, k=10, random=TRUE,
+ predictions=FALSE, strat=FALSE, getmodels=NULL, list.tindx = NULL, ...) {
+
+ # k-fold cross-validation of Brier's score
+
+ if (is.null(predictions)) predictions <- FALSE
+ if(is.null(random)) random <- TRUE
+ if (is.null(predictions)) predictions <- FALSE
+ if (is.null(strat)) strat <- FALSE
+ if (is.null(getmodels)) getmodels <- FALSE
+ USEPM <- FALSE
+
+ N <- length(y[,1])
+ nindx <- 1:N
+ if(is.null(random)) random <- TRUE
+ if(is.null(k)) k <- 10
+ if (is.null(data)) data <- rep(1, N)
+
+ if(!is.null(list.tindx)) k <- length(list.tindx)
+ if(is.null(k)) stop("k for k-fold cross-validation is missing")
+
+ # determine an appropriate splitting for the sample size into
+ # k roughly equally sized parts
+
+ # if(is.null(list.tindx)) {
+ a <- kfoldcv(k, N)
+ # to reproduce results, either use `set.seed' or a fixed partition of
+ # the samples
+ if (random)
+ myindx <- sample(1:N, N)
+ else
+ myindx <- 1:N
+ # }
+
+ if (getmodels)
+ models <- vector(k, mode="list")
+
+ cverr <- c()
+ for(i in 1:k) {
+ if(!is.null(list.tindx)) {
+ tindx <- list.tindx[[i]]
+ } else {
+ if (i > 1)
+ tindx <- myindx[(sum(a[1:(i-1)])+1):sum(a[1:i])]
+ else
+ tindx <- myindx[1:a[1]]
+ }
+
+ folddata <- subset(data, !(nindx %in% tindx))
+ mymodel <- model(formula, data=folddata, ...)
+ if (getmodels) models[[i]] <- mymodel
+
+ # check if mymodel is a function which should be used instead of
+ # predict
+ if (is.function(mymodel)) {
+ if(!is.null(predict) & i == 1)
+ warning("model returns a function and predict is specified, using models output")
+ predict <- mymodel
+ USEPM <- TRUE
+ }
+
+ outfolddata <- subset(data, (nindx %in% tindx))
+ if (USEPM)
+ pred <- predict(newdata=outfolddata)
+ else
+ pred <- predict(mymodel, newdata = outfolddata)
+ if (is.list(pred)) {
+ if (!inherits(pred[[1]], "survfit") && !inherits(pred, "survfit"))
+ stop("predict does not return a list of survfit objects")
+ } else {
+ stop("predict does not return a list of survfit objects")
+ }
+
+ err <- sbrier(y[sort(tindx)], pred)
+ cverr <- c(cverr,rep(err, length(tindx)))
+ }
+ RET <- list(error = mean(cverr), k=k)
+ if (getmodels)
+ RET <- c(RET, models=list(models))
+ class(RET) <- "cvsurv"
+ RET
+}
+
diff --git a/R/errorest.R b/R/errorest.R
new file mode 100644
index 0000000..baaf22b
--- /dev/null
+++ b/R/errorest.R
@@ -0,0 +1,155 @@
+# $Id: errorest.R,v 1.25 2005/06/29 08:50:28 hothorn Exp $
+
+control.errorest <- function(k= 10, nboot = 25, strat=FALSE,
+ random=TRUE, predictions=FALSE, getmodels=FALSE, list.tindx = NULL) {
+ if (k < 1) {
+ warning("k < 1, using k=10")
+ k <- 10
+ }
+ if (nboot < 1) {
+ warning("nboot < 1, using nboot=25")
+ nboot <- 25
+ }
+ if (!is.logical(strat)) {
+ warning("strat is not a logical, using strat=FALSE")
+ strat <- FALSE
+ }
+ if (!is.logical(random)) {
+ warning("random is not a logical, using random=TRUE")
+ random <- TRUE
+ }
+ if (!is.logical(predictions)) {
+ warning("predictions is not a logical, using predictions=FALSE")
+ predictions <- FALSE
+ }
+
+ if (!is.logical(getmodels)) {
+ warning("getmodel is not a logical, using getmodels=FALSE")
+ getmodels <- FALSE
+ }
+
+ RET <- list(k=k, nboot=nboot, strat=strat, random=random,
+ predictions=predictions, getmodels=getmodels, list.tindx = list.tindx)
+ return(RET)
+}
+
+errorest <- function(formula, data, ...) UseMethod("errorest", data)
+
+errorest.default <- function(formula, data, ...)
+ stop(paste("Do not know how to handle objects of class", class(data)))
+
+errorest.data.frame <- function(formula, data, subset, na.action=na.omit,
+ model=NULL, predict=NULL,
+ estimator = c("cv", "boot", "632plus"),
+ est.para = control.errorest(), ...) {
+
+ cl <- match.call()
+ m <- match.call(expand.dots = FALSE)
+ if (length(grep("inclass", paste(m$model))) > 0 ||
+ length(grep("inbagg", paste(m$model))) > 0) {
+ RET <- errorestinclass(formula, data=data, subset, na.action,
+ model, predict, estimator, est.para, ...)
+ RET$call <- cl
+ } else {
+
+ if(missing(formula)
+ || (length(formula) != 3)
+ || (length(attr(terms(formula[-3], data = data), "term.labels")) != 1))
+ stop("formula missing or incorrect")
+ NOPRED <- (length(attr(terms(formula[-2], data = data), "term.labels")) < 1)
+ if(is.matrix(eval(m$data, parent.frame())))
+ m$data <- as.data.frame(data)
+ m[[1]] <- as.name("model.frame")
+ m$... <- NULL
+ m$model <- NULL
+ m$predict <- NULL
+ m$estimator <- NULL
+ m$est.para <- NULL
+
+ mf <- eval(m, parent.frame())
+
+ response <- attr(attr(mf, "terms"), "response")
+ # just extract the data.frame, NA handling here
+ # make sure to leave the time and censoring variable here
+ # for "Surv(time, cens) ~ ." formulas
+ # delete terms attribute
+ attr(mf, "terms") <- NULL
+ y <- mf[,response]
+ if (!NOPRED & !is.Surv(y))
+ data <- mf
+ else
+ data <- data[complete.cases(data),]
+
+ if(all(estimator %in% c("boot", "632plus")) & all(c("boot", "632plus") %in% estimator)) {
+ estimator <- paste(sort(estimator), collapse = "_")
+ } else {
+ if(length(estimator) > 1) {
+ estimator <- estimator[1]
+# warning(paste("Multiple choice of estimators, only", estimator, "is performed"))
+ } else {
+ estimator <- match.arg(estimator)
+ }
+ }
+
+ if(is.null(model))
+ stop("no model specified")
+
+ switch(estimator,
+ "cv" = {
+ RET <- cv(y, formula, data, model=model, predict=predict,
+ k=est.para$k, random=est.para$random,
+ predictions=est.para$predictions, strat=est.para$strat,
+ getmodels=est.para$getmodels, list.tindx = est.para$list.tindx, ...)
+ }, "boot" = {
+ RET <- bootest(y, formula, data, model=model, predict=predict,
+ nboot=est.para$nboot, list.tindx = est.para$list.tindx, predictions = est.para$predictions, ...)
+ }, "632plus" = {
+ RET <- bootest(y, formula, data, model=model, predict=predict,
+ nboot=est.para$nboot, bc632plus=TRUE, list.tindx = est.para$list.tindx, predictions = est.para$predictions, ...)
+ }, "632plus_boot" = {
+ RET <- bootest(y, formula, data, model=model, predict=predict,
+ nboot=est.para$nboot, bc632plus = TRUE, list.tindx = est.para$list.tindx, predictions = est.para$predictions, both.boot = TRUE, ...)
+ }
+ )
+ }
+ RET$call <- cl
+ return(RET)
+}
+
+errorestinclass <- function(formula, data, subset=NULL, na.action=NULL,
+ model=NULL, predict=NULL,
+ estimator = c("cv", "boot", "632plus"),
+ est.para = control.errorest(), ...) {
+ if (is.null(data)) stop("data argument required but not given")
+# if (is.null(iclass))
+# stop("no class membership variable for indirect classification given")
+ iclass <- paste(formula[[2]][[2]])
+ if (!(iclass %in% colnames(data)))
+ stop("membership variable not in given data")
+
+ # <FIXME>
+# data <- data[complete.cases(data),]
+ # </FIXME>
+
+ iclassindx <- which(colnames(data) == iclass)
+
+ y <- data[,iclassindx]
+ if (!is.factor(y)) stop("iclass is not a factor")
+# X <- data[,-iclassindx]
+ X <- data
+
+ if(is.null(model))
+ stop("no classifier specified")
+
+ switch(estimator, "cv" = {
+ RET <- cv(y, formula, data=X, model=model, predict=predict,
+ k=est.para$k, random=est.para$random, list.tindx = est.para$list.tindx, ...)
+ }, "boot" = {
+ RET <- bootest(y, formula, data=X, model=model, predict=predict,
+ nboot=est.para$nboot, list.tindx = est.para$list.tindx, ...)
+ }, "632plus" = {
+ RET <- bootest(y, formula, data=X, model=model, predict=predict,
+ nboot=est.para$nboot, bc632plus=TRUE, list.tindx = est.para$list.tindx, ...)
+ })
+ RET
+}
diff --git a/R/inbagg.R b/R/inbagg.R
new file mode 100644
index 0000000..240447a
--- /dev/null
+++ b/R/inbagg.R
@@ -0,0 +1,241 @@
+workhorse.inbagg <- function(object, y, X, W,
+ cFUN, w.training.set, y.training.set, bcontrol, control, ...)
+{
+ formula.list <- object
+ data <- data.frame(y, X, W)
+ mtrees <- vector(mode="list", length=bcontrol$nbagg)
+ if(w.training.set[1] == "all") fit.vals <- 1:length(y)
+
+ for (i in 1:bcontrol$nbagg) {
+ bindx <- sample(1:length(y), bcontrol$ns, replace=bcontrol$replace)
+ if(w.training.set[1] == "oob") fit.vals <- (-bindx)
+ if(w.training.set[1] == "bag") fit.vals <- bindx
+
+ objs <- vector(mode="list", length=length(formula.list)) #prediction models for intermediate variables
+ names(objs) <- names(formula.list)
+
+ addclass <- function() { ##START addclass <- function()
+ for (j in 1:length(formula.list)) { ##Fitting prediction models for intermediates
+ oX <- data[fit.vals, c(paste(formula.list[[j]]$formula[[2]]),
+ attr(terms(formula.list[[j]]$formula, dataa = data), "term.labels"))]
+ foo <- try(formula.list[[j]]$model(formula.list[[j]]$formula, data = oX))
+ objs[[j]] <- foo
+ }
+
+ fct <- function(newdata) { ##START fct <- function(newdata)
+ if (!is.data.frame(newdata))
+ newdata <- as.data.frame(newdata)
+ add.predictors <- rep(0, nrow(newdata))
+
+ for (j in 1:length(formula.list)){ ## predict additional intermediates using fitted models
+ oXnewdata <- newdata[,attr(terms(formula.list[[j]]$formula, data = data), "term.labels")]
+ if(is.null(formula.list[[j]]$predict)) {
+ res <- try(predict(objs[[j]], newdata = oXnewdata))
+ } else {
+ res <- try(formula.list[[j]]$predict(objs[[j]], newdata = oXnewdata))
+ }
+###FIX: action for class(res) == "try-error"
+ add.predictors <- data.frame(add.predictors, res)
+ }
+ add.predictors <- add.predictors[,-1]
+ if(is.null(dim(add.predictors))) add.predictors <- matrix(add.predictors, ncol = 1)
+ colnames(add.predictors) <- names(formula.list)
+ add.predictors
+ }
+ ##END fct <- function(newdata)
+ return(fct)
+ } ##END addclass <- function()
+
+
+ bfct <- addclass() ###bfct is a function (addclass)
+
+ if (!is.null(bfct)) {
+ expl.cFUN <- attr(terms(as.formula(cFUN$formula), data = data), "term.labels")
+
+ if(!is.null(cFUN$fixed.function)) {
+ btree <- cFUN
+ } else {
+ W.new <- bfct(X)
+ W.new.names <- sub(".[0-9]$", "", colnames(W.new))
+
+ if(y.training.set[1] == "fitted.bag") { ###contstruct on bag
+ oX <- data.frame(y, X, W.new)[bindx,]
+ right.side <- paste(c(expl.cFUN[!(expl.cFUN %in% W.new.names)], colnames(W.new)[W.new.names %in% expl.cFUN]), collapse = "+")
+ cFUN$formula <- as.formula(paste(cFUN$formula[[2]], "~", right.side))
+ }
+
+ if(y.training.set[1] == "original") { ###construct on original variables
+ if(length(W.new.names)> length(colnames(W))) stop("If classifying function is trained on original intermediate, only one predictive model per intermediate can be constructed.")
+ oX <- data.frame(y, X, W[,W.new.names])
+ names(oX)[(ncol(oX)-ncol(W)+1):ncol(oX)] <- colnames(W.new)
+ }
+
+ if(y.training.set[1] == "fitted.subset") { ###construct on subset
+ oX <- data.frame(y, X, W.new)[!subset,]
+ right.side <- paste(c(expl.cFUN[!(expl.cFUN %in% W.new.names)], colnames(W.new)[W.new.names %in% expl.cFUN]), collapse = "+")
+ cFUN$formula <- as.formula(paste(cFUN$formula[[2]], "~", right.side))
+ }
+ names(oX)[names(oX) == "y"] <- paste(cFUN$formula[[2]])
+ btree <- cFUN$model(cFUN$formula, data = oX, ...)
+ btree <- list(model = btree, predict = cFUN$predict)
+ }
+
+ this <- list(bindx = bindx, btree = btree, bfct=bfct)
+ } else {
+ stop("Predictive function for intermediates not executable: Classifying function can not be applied.")
+ }
+ class(this) <- "thisclass"
+ mtrees[[i]] <- this
+ }
+ mtrees
+}
+
+
+
+
+inbagg <- function(formula, data, ...) UseMethod("inbagg", data)
+
+inbagg.default <- function(formula, data,...)
+{
+ stop(paste("Do not know how to handle objects of class", class(data)))
+}
+
+
+inbagg.data.frame <- function(formula, data, pFUN=NULL,
+ cFUN=list(model = NULL, predict = NULL, training.set = NULL),
+ nbagg = 25, ns = 0.5, replace = FALSE, ...)
+{
+ if(!is.function(cFUN)) {
+ if(is.null(cFUN$model)) {
+ cFUN$model <- function(formula, data)
+ rpart(formula, data, control = rpart.control(minsplit=2, cp=0, xval=0))
+ if(is.null(cFUN$predict)) cFUN$predict <- function(object, newdata) predict(object, newdata, type = "class")
+ if(is.null(cFUN$training.set)) cFUN$trainig.set <- "fitted.bag"
+ }
+ }
+
+##check formula
+ if(missing(formula)
+ || (length(formula) != 3)
+ || (length(attr(terms(formula[-2], data = data), "term.labels")) < 1))
+ stop("formula missing or incorrect")
+
+ m <- match.call(expand.dots = FALSE)
+ if(is.matrix(eval(m$data, parent.frame())))
+ m$data <- as.data.frame(data)
+
+##editing formula
+ if(length(formula[[2]])==3) {
+ if(!is.function(cFUN)) {
+ if (is.null(cFUN$formula)) y.formula <- as.formula(formula[[2]]) else y.formula <- cFUN$formula
+ }
+
+ w.formula <- XX~YY
+ w.formula[[2]] <- formula[[2]][[3]]
+ w.formula[[3]] <- formula[[3]]
+
+ response <- paste(formula[[2]][[2]])
+ w.names <- attr(terms(as.formula(formula[[2]]), data = data), "term.labels")
+ x.names <- attr(terms(as.formula(formula), data = data), "term.labels")
+
+ if(length(x.names == 1) && x.names == ".") x.names <- colnames(data)[!(colnames(data) %in% c(response, w.names))]
+ y <- data[, response]
+ X <- data[, x.names]
+ W <- data[, w.names]
+ if(is.null(dim(X))) X <- matrix(X, ncol = 1, dimnames = list(rownames(W), x.names))
+ if(is.null(dim(W))) W <- matrix(W, ncol = 1, dimnames = list(rownames(X), w.names))
+
+ if(is.function(cFUN)) {
+ y.formula <- as.formula(paste(formula[[2]][[2]], "~", paste(c(x.names, w.names), collapse = "+")))
+ fixed.function <- cFUN
+ cFUN <- list()
+ cFUN$fixed.function <- fixed.function
+ }
+ cFUN$formula <- y.formula
+
+ } else {
+ stop(paste("Specified formula has to be of type y~x~w"))
+ }
+##remove settings of training.set
+ if(is.null(pFUN$training.set)) w.training.set <- "oob" else w.training.set <- pFUN$training.set[1]
+ pFUN$training.set <- NULL
+
+ if(is.null(cFUN$training.set)) y.training.set <- "fitted.bag" else y.training.set <- cFUN$training.set[1]
+ cFUN$training.set <- NULL
+
+ bcontrol <- list(nbagg = nbagg, ns = length(y)*ns, replace = replace)
+
+ if(is.null(w.formula)) stop("no formula for prediction model specified")
+
+ ##formula.list : list of lists which specify an abitrary number of models for intermediate variables:
+ ##w1.1, w2.1, w3.1, ...., w2.1, w2.2, w3.1, .... where 'w*' is the variable and '.*' describes the model
+
+ P <- length(pFUN)
+ number.models <- c()
+ for(i in 1:P) {
+ if(is.null(pFUN[[i]]$formula)) pFUN[[i]]$formula <- w.formula
+ number.models <- c(number.models,
+ paste(attr(terms(pFUN[[i]]$formula[-3], data = data), "term.labels"),
+ ".", i, sep = ""))
+ }
+
+ formula.list <- vector(mode = "list", length= length(number.models))
+ names(formula.list) <- paste(number.models)
+
+ for(i in 1:P) {
+ res <- list()
+ Qi <- length(attr(terms(pFUN[[i]]$formula[-3], data = data), "term.labels"))
+ for(j in 1:Qi) {
+ res$formula <- w.formula
+ res$formula[[2]] <- as.name(attr(terms(res$formula[-3], data = data), "term.labels")[j])
+ res$formula[[3]] <- pFUN[[i]]$formula[[3]]
+
+ if(res$formula[[3]] == ".") res$formula <- as.formula(paste(res$formula[[2]], "~", paste(x.names, collapse= "+")))
+ res$model <- pFUN[[i]]$model
+ res$predict <- pFUN[[i]]$predict
+ formula.list[[paste(res$formula[[2]], ".", i, sep = "")]] <- res
+ }
+ }
+
+##apply
+ res <- workhorse.inbagg(object = formula.list, y = y, X = X, W = W,
+ cFUN = cFUN, w.training.set = w.training.set, y.training.set = y.training.set,
+ bcontrol = bcontrol, ...)
+ RET <- list(mtrees = res, y = y, W = W, X = X)
+ class(RET) <- "inbagg"
+ RET
+}
+
+
+print.inbagg <- function(x, ...)
+{
+ q <- length(x$mtrees)
+ intermediates <- attr(x$W, "names")
+ text.intermediates <- paste("Indirect bagging, with", q,
+ "bootstrap samples and intermediate variables: \n",
+ paste(intermediates, collapse = " "))
+ cat("\n", text.intermediates, "\n")
+}
+
+
+summary.inbagg <- function(object, ...)
+{
+ class(object) <- "summary.inbagg"
+ object
+}
+
+
+print.summary.inbagg <- function(x, ...)
+{
+ q <- length(x$mtrees)
+ intermediates <- attr(x$W, "names")
+
+ text.intermediates <- paste("Indirect bagging, with", q,
+"bootstrap samples and intermediate variables:", paste(intermediates, collapse = " "))
+
+ cat("\n", text.intermediates, "\n")
+ for(i in 1:length(x)) {
+ print(x$mtrees[[i]])
+ }
+}
+
diff --git a/R/inclass.R b/R/inclass.R
new file mode 100644
index 0000000..c837b47
--- /dev/null
+++ b/R/inclass.R
@@ -0,0 +1,190 @@
+# $Id: inclass.R,v 1.33 2008/08/04 08:18:41 hothorn Exp $
+
+inclass <- function(formula, data, ...) UseMethod("inclass", data)
+
+inclass.default <- function(formula, data, ...)
+{
+ stop(paste("Do not know how to handle objects of class", class(data)))
+}
+
+inclass.data.frame <- function(formula, data, pFUN = NULL, cFUN = NULL, ...)
+{
+##check formula
+ if(missing(formula)
+ || (length(formula) != 3)
+ || (length(attr(terms(formula[-2], data = data), "term.labels")) < 1))
+ stop("formula missing or incorrect")
+
+ m <- match.call(expand.dots = FALSE)
+ if(is.matrix(eval(m$data, parent.frame())))
+ m$data <- as.data.frame(data)
+
+##editing formula
+###main formula
+ if(length(formula[[2]])==3) {
+ if(is.function(cFUN)) y.formula <- formula[[2]] else y.formula <- cFUN$formula
+ w.formula <- XX~YY
+ w.formula[[2]] <- formula[[2]][[3]]
+ w.formula[[3]] <- formula[[3]]
+
+ response <- paste(formula[[2]][[2]])
+ w.names <- attr(terms(as.formula(formula[[2]]), data = data), "term.labels")
+ x.names <- attr(terms(as.formula(formula), data = data), "term.labels")
+
+ if(x.names[1] == ".") x.names <- colnames(data)[!(colnames(data) %in% c(response, w.names))]
+ } else {
+ stop(paste("Specified formula has to be of type y~x~w"))
+ }
+
+
+ if(is.null(w.formula)) stop("no formula for prediction model specified")
+
+ formula.list <- vector(mode = "list", length= length(w.names))
+ names(formula.list) <- w.names
+
+ P <- length(pFUN)
+ Qi <- length(w.names)
+ for(j in 1:Qi) {
+ res <- list()
+ res$formula <- w.formula
+ res$formula[[2]] <- as.name(attr(terms(res$formula[-3], data = data), "term.labels")[j])
+ if(res$formula[[3]] == ".") {
+ res$formula <- as.formula(paste(res$formula[[2]], "~", paste(x.names, collapse= "+")))
+ }
+ for(i in 1:P) {
+ if(is.null(pFUN[[i]]$formula)) {
+ if(is.null(formula.list[[w.names[j]]]$formula)) formula.list[[w.names[j]]]$formula <- res$formula
+ if(is.null(formula.list[[w.names[j]]]$model)) formula.list[[w.names[j]]]$model <- pFUN[[i]]$model
+ if(is.null(formula.list[[w.names[j]]]$predict)) formula.list[[w.names[j]]]$predict <- pFUN[[i]]$predict
+ } else {
+ QQ <- attr(terms(pFUN[[i]]$formula[-3], data = data), "term.labels")
+ for(k in QQ) {
+ if(w.names[j] == k) {
+ res$formula[[3]] <- pFUN[[i]]$formula[[3]]
+ if(paste(pFUN[[i]]$formula[[3]]) == ".") {
+ res$formula[[3]] <- as.formula(paste(w.names[j], "~", paste(x.names, collapse= "+")))
+ }
+ formula.list[[w.names[j]]]$formula <- pFUN[[i]]$formula
+ formula.list[[w.names[j]]]$model <- pFUN[[i]]$model
+ formula.list[[w.names[j]]]$predict <- pFUN[[i]]$predict
+ }
+ }
+ }
+
+ }
+ }
+
+ if(!is.function(cFUN)) {
+ cFUN$formula <- y.formula
+ if(is.null(cFUN$training.set)) cFUN$training.set <- "original"
+ }
+
+ result <- workhorse.inclass(object = formula.list, data = data, cFUN = cFUN, ...)
+ return(result)
+}
+
+
+workhorse.inclass <- function(object, data, cFUN, subset, na.action, ...)
+{
+ formula.list <- object
+ q <- length(formula.list)
+
+ result <- list()
+ namen <- c()
+
+##model fitting
+ for(i in 1:q) {
+ formula <- formula.list[[i]]$formula
+##check necessary?? >
+ if(missing(formula)
+ || (length(formula) != 3)
+ || (length(attr(terms(formula[-2], data = data), "term.labels")) < 1)
+ || (length(attr(terms(formula[-3], data = data), "term.labels")) != 1))
+ stop("formula missing or incorrect")
+## check necessary?? <
+ m <- match.call(expand.dots= FALSE)
+ res <- formula.list[[i]]$model(formula = formula, data = data)
+
+ namen <- c(namen, as.character(formula[[2]]))
+ result <- c(result, list(res))
+ }
+ names(result) <- namen
+
+ if(!is.function(cFUN)) {
+ ###cFUN can be trained on original intermediate variables or on fitted values or on the subset
+ if(!is.null(m$subset) && cFUN$training.set == "subset") dataresp <- data[!subset, ]
+ if(cFUN$training.set == "original") dataresp <- data
+ if(cFUN$training.set == "fitted") {
+ dataresp <- data
+ for(i in 1:q){
+ if(!is.null(formula.list[[i]]$predict)){
+ dataresp[,namen[i]] <- formula.list[[i]]$predict(result[[i]], newdata = data)} else {
+ dataresp[,namen[i]] <- predict(result[[i]], newdata = data)
+ }
+ }
+ }
+ model.response <- cFUN$model(as.formula(cFUN$formula), data = dataresp, ...)
+ } else {
+ model.response <- cFUN
+ }
+
+###predict specificatiations are not delivered
+ result <- list("model.intermediate" = result, "model.response" = model.response, "para.intermediate" = object, "para.response" = cFUN)
+
+ class(result) <- "inclass"
+ return(result)
+}
+
+
+print.inclass <- function(x, ...)
+{
+ x <- x$model.intermediate
+ q <- length(x)
+ intermediates <- attr(x, "names")
+ classes <- c()
+ for(i in 1:q) {
+ classes <- c(classes, class(x[[i]]))
+ }
+
+ text.intermediates <- paste("Indirect classification, with", q, "intermediate variables:")
+ if(length(unique(classes)) == 1) {
+ predictive <- paste("Predictive model per intermediate is", unique(classes))
+ } else {
+ predictive <- paste("Predictive model per intermediate is \n",
+ paste(intermediates, ": ", classes, "\n ", collapse = ""))
+ }
+ cat("\n", text.intermediates, "\n", intermediates, "\n", "\n", predictive, "\n")
+}
+
+
+summary.inclass <- function(object, ...)
+{
+ class(object) <- "summary.inclass"
+ object
+}
+
+
+print.summary.inclass <- function(x, ...)
+{
+ x <- x$model.intermediate
+ q <- length(x)
+ intermediates <- attr(x, "names")
+ classes <- c()
+ for(i in 1:q) {
+ classes <- c(classes, class(x[[i]]))
+ }
+
+ text.intermediates <- paste("Indirect classification, with", q, "intermediate variables:")
+ if(length(unique(classes)) == 1) {
+ predictive <- paste("Predictive model per intermediate is", unique(classes))
+ } else {
+ predictive <- paste("Predictive model per intermediate is", "\n ",
+ paste(intermediates, ": ", classes, "\n ", collapse = ""))
+ }
+ cat("\n", text.intermediates, "\n", intermediates, "\n", "\n", predictive,
+ "\n", "\n", "Models:", "\n")
+ print(x)
+
+}
+
+
diff --git a/R/ipredbagg.R b/R/ipredbagg.R
new file mode 100644
index 0000000..7e2a850
--- /dev/null
+++ b/R/ipredbagg.R
@@ -0,0 +1,266 @@
+#$Id: ipredbagg.R,v 1.13 2003/06/11 10:40:17 peters Exp $
+
+workhorse <- function(y, X, control, comb, bcontrol, thisclass, ...) {
+ # This is double-bagging (comb is lda) or bundling (any arbritrary
+ # model in comb)
+ if (!is.data.frame(X)) X <- as.data.frame(X)
+
+ # check user supplied functions
+ if (!is.list(comb)) stop("comb not a list")
+
+ N <- nrow(X)
+
+ mydata <- cbind(data.frame(y), X)
+ mtrees <- vector(mode="list", length=bcontrol$nbagg)
+
+ for (i in 1:bcontrol$nbagg) {
+ # double-bagging or bundling
+ # comb is a list of lists, each of them having two elements:
+ # model and predict
+
+ bindx <- sample(1:N, bcontrol$ns, replace=bcontrol$replace)
+
+ objs <- vector(mode="list", length=length(comb))
+ addclass <- function() {
+ myindx <- 1:length(comb)
+ for (k in 1:length(comb)) {
+ # put the user supplied models into a try statement
+ # if this fails, simply ignore it.
+ # options(show.error.messages = FALSE)
+ oX <- mydata[-bindx,]
+ foo <- try(comb[[k]]$model(y ~ ., data=oX))
+ if (inherits(foo, "try-error")) {
+ warning("could not build model:")
+ print(foo[1])
+ foo <- NA
+ myindx <- myindx[-k]
+ }
+ objs[[k]] <- foo
+ # options(show.error.messages = TRUE)
+ }
+ fct <- function(newdata) {
+ # use lexical scoping: return this function for the computation of
+ # the additional predictors
+ if (!is.data.frame(newdata))
+ newdata <- as.data.frame(newdata)
+ addpred <- c()
+ # the user supplied model failed, ignore it here.
+ if (length(myindx) < 1) {
+ RET <- NULL
+ } else {
+ # compute additional predictors for user supplied models
+ for (k in myindx)
+ addpred <- cbind(addpred, comb[[k]]$predict(objs[[k]], newdata))
+ # <FIXME>: more informative names???
+ colnames(addpred) <- paste("addpred", 1:ncol(addpred), sep="")
+ # </FIXME>
+ RET <- addpred
+ }
+ RET
+ }
+ if (length(myindx) < 1) return(NULL) else return(fct)
+ }
+ bfct <- addclass()
+ # may have failed
+ if (!is.null(bfct)) {
+ # grow a tree using the original predictors
+ # from the bootstrap sample and the additional predictors computed on
+ # the bootstrap sample.
+ oX <- cbind(mydata, bfct(X))[bindx,]
+ btree <- rpart(y ~., data=oX, control = control,...)
+ # return this object
+ this <- list(bindx = bindx, btree = btree, bfct=bfct)
+ } else {
+ # return a simple tree if the user supplied model failed.
+ oX <- mydata[bindx,]
+ btree <- rpart(y ~., data=oX, control = control,...)
+ this <- list(bindx = bindx, btree = btree)
+ }
+ class(this) <- thisclass
+ mtrees[[i]] <- this
+ }
+ mtrees
+}
+
+
+ipredbagg <- function(y, ...) {
+ if(is.null(class(y)))
+ class(y) <- data.class(y)
+# UseMethod("ipredbagg", y, ...)
+ UseMethod("ipredbagg", y)
+}
+
+ipredbagg.default <- function(y, ...) {
+ stop(paste("Do not know how to handle objects of class", class(y)))
+}
+
+ipredbagg.integer <- function(y, ...) {
+ ipredbagg.numeric(y,...)
+}
+
+
+ipredbagg.factor <- function(y, X=NULL, nbagg=25, control=
+ rpart.control(minsplit=2, cp=0, xval=0),
+ comb=NULL, coob=FALSE, ns=length(y), keepX =
+ TRUE, ...) {
+ # bagging classification trees
+
+ if (!is.null(comb) && coob)
+ stop("cannot compute out-of-bag estimate for combined models")
+
+ if (nbagg == 1 && coob)
+ stop("cannot compute out-of-bag estimate for single tree")
+
+ # check nbagg
+ if (nbagg < 1) stop("nbagg is not a positive integer")
+ # bagging only if nbagg greater 1, else use the whole sample, i.e. one
+ # simple tree
+ if (nbagg == 1) {
+ REPLACE <- FALSE
+ } else {
+ if (ns < length(y)) {
+ # this is "subagging", i.e. sampling ns out of length(y) WITHOUT
+ # replacement
+ REPLACE <- FALSE
+ } else {
+ # the usual bootstrap: n out of n with replacement
+ REPLACE <- TRUE
+ }
+ }
+
+ if (!is.null(comb)) {
+ # this is rather slow but we need to be as general as possible
+ # with respect to classifiers as well as outcome of prediction (classes,
+ # linear discriminant functions, conditional class probabilities, random
+ # noise, if you like)
+ mtrees <- workhorse(y, X, control, comb,
+ bcontrol=list(nbagg=nbagg, ns=ns, replace=REPLACE),
+ thisclass="sclass")
+ } else {
+ # use an optimized version
+ mydata <- cbind(data.frame(y), X)
+ mtrees <- irpart(y ~ ., data=mydata, control=control,
+ bcontrol=list(nbagg=nbagg, ns=ns, replace=REPLACE))
+ }
+ # always keep response and predictors as well as a list of nbagg objects
+ # of class "sclass"
+ if (keepX)
+ RET <- list(y=y, X=X, mtrees=mtrees, OOB=coob, comb=!is.null(comb))
+ else
+ RET <- list(y=y, X=NULL, mtrees=mtrees, OOB=coob, comb=!is.null(comb))
+ class(RET) <- "classbagg"
+
+ if (coob) {
+ pred <- predict(RET)
+ ae <- all.equal(levels(pred), levels(RET$y))
+ if (is.logical(ae) && ae)
+ RET$err <- mean(pred != RET$y, na.rm=TRUE)
+ else
+ RET$err <- mean(as.character(pred) != as.character(RET$y),
+ na.rm=TRUE)
+ }
+ RET
+}
+
+ipredbagg.numeric <- function(y, X=NULL, nbagg=25, control=
+ rpart.control(xval=0),
+ comb=NULL, coob=FALSE, ns=length(y), keepX =
+ TRUE, ...) {
+ # <FIXME> is control meaningful here??? </FIXME>
+
+ # bagging regression trees
+
+ if (!is.null(comb) && coob)
+ stop("cannot compute out-of-bag estimate for combined models")
+
+ if (nbagg == 1 && coob)
+ stop("cannot compute out-of-bag estimate for single tree")
+
+ # check nbagg
+ if (nbagg < 1) stop("nbagg is not a positive integer")
+ # only bagg if nbagg greater 1, else use the whole sample
+ if (nbagg == 1) {
+ REPLACE <- FALSE
+ } else {
+ if (ns < length(y)) {
+ # this is "subagging", i.e. sampling ns out of length(y) WITHOUT
+ # replacement
+ REPLACE <- FALSE
+ } else {
+ # the usual bootstrap: n out of n with replacement
+ REPLACE <- TRUE
+ }
+ }
+
+ if (!is.null(comb)) {
+ mtrees <- workhorse(y, X, control, comb,
+ bcontrol=list(nbagg=nbagg, ns=ns, replace=REPLACE),
+ thisclass="sreg")
+ } else {
+ mydata <- cbind(data.frame(y), X)
+ mtrees <- irpart(y ~ ., data=mydata, control=control,
+ bcontrol=list(nbagg=nbagg, ns=ns, replace=REPLACE))
+ }
+
+ if (keepX)
+ RET <- list(y=y, X=X, mtrees=mtrees, OOB=coob, comb=!is.null(comb))
+ else
+ RET <- list(y=y, X=NULL, mtrees=mtrees, OOB=coob, comb=!is.null(comb))
+ class(RET) <- "regbagg"
+
+ if (coob)
+ RET$err <- sqrt(mean((predict(RET) - RET$y)^2, na.rm=TRUE))
+ RET
+}
+
+
+ipredbagg.Surv <- function(y, X=NULL, nbagg=25, control=
+ rpart.control(xval=0),
+ comb=NULL, coob=FALSE, ns=dim(y)[1], keepX =
+ TRUE, ...) {
+ # <FIXME> is control meaningful here??? </FIXME>
+
+ # bagging survival trees
+
+ if (!is.null(comb) && coob)
+ stop("cannot compute out-of-bag estimate for combined models")
+
+ if (nbagg == 1 && coob)
+ stop("cannot compute out-of-bag estimate for single tree")
+
+ # check nbagg
+ if (nbagg < 1) stop("nbagg is not a positive integer")
+ # only bagg if nbagg greater 1, else use the whole sample
+ if (nbagg == 1) {
+ REPLACE <- FALSE
+ } else {
+ if (ns < dim(y)[1]) {
+ # this is "subagging", i.e. sampling ns out of length(y) WITHOUT
+ # replacement
+ REPLACE <- FALSE
+ } else {
+ # the usual bootstrap: n out of n with replacement
+ REPLACE <- TRUE
+ }
+ }
+
+ if (!is.null(comb)) {
+ mtrees <- workhorse(y, X, control, comb,
+ bcontrol=list(nbagg=nbagg, ns=ns, replace=REPLACE),
+ thisclass="ssurv")
+ } else {
+ mydata <- cbind(data.frame(y), X)
+ mtrees <- irpart(y ~ ., data=mydata, control=control,
+ bcontrol=list(nbagg=nbagg, ns=ns, replace=REPLACE))
+ }
+ if (keepX)
+ RET <- list(y=y, X=X, mtrees=mtrees, OOB=coob, comb=!is.null(comb))
+ else
+ RET <- list(y=y, X=NULL, mtrees=mtrees, OOB=coob, comb=!is.null(comb))
+ class(RET) <- "survbagg"
+
+ if (coob)
+ RET$err <- sbrier(RET$y, predict(RET))
+ RET
+}
+
diff --git a/R/ipredknn.R b/R/ipredknn.R
new file mode 100644
index 0000000..2478c4c
--- /dev/null
+++ b/R/ipredknn.R
@@ -0,0 +1,69 @@
+# $Id: ipredknn.R,v 1.5 2005/06/29 08:50:28 hothorn Exp $
+
+# k-NN compatible with the fit(formula) - predict(object) framework
+
+ipredknn <- function(formula, data, subset, na.action, k=5, ...) {
+ cl <- match.call()
+ if(missing(formula)
+ || (length(formula) != 3)
+ || (length(attr(terms(formula[-2], data = data), "term.labels")) < 1)
+ || (length(attr(terms(formula[-3], data = data), "term.labels")) != 1))
+ stop("formula missing or incorrect")
+ m <- match.call(expand.dots = FALSE)
+ if(is.matrix(eval(m$data, parent.frame())))
+ m$data <- as.data.frame(data)
+ m[[1]] <- as.name("model.frame")
+ m$... <- NULL
+ m$k <- NULL
+ m <- eval(m, parent.frame())
+ Terms <- attr(m, "terms")
+ y <- model.extract(m, "response")
+ x <- model.matrix(Terms, m)
+ xvars <- as.character(attr(Terms, "variables"))[-1]
+ if ((yvar <- attr(Terms, "response")) > 0) xvars <- xvars[-yvar]
+ xlev <- if (length(xvars) > 0) {
+ xlev <- lapply(m[xvars], levels)
+ xlev[!sapply(xlev, is.null)]
+ }
+ xint <- match("(Intercept)", colnames(x), nomatch=0)
+ if(xint > 0) x <- x[, -xint, drop=FALSE]
+ RET <- list(learn=list(y=y, X=x))
+ RET$k <- k
+ RET$terms <- Terms
+ RET$call <- match.call()
+ RET$contrasts <- attr(x, "contrasts")
+ RET$xlevels <- xlev
+ attr(RET, "na.message") <- attr(m, "na.message")
+ if(!is.null(attr(m, "na.action"))) RET$na.action <- attr(m, "na.action")
+ class(RET) <- "ipredknn"
+ RET
+}
+
+predict.ipredknn <- function(object, newdata, type=c("prob", "class"), ...) {
+ type <- match.arg(type)
+ if(!inherits(object, "ipredknn")) stop("object not of class ipredknn")
+ if(!is.null(Terms <- object$terms)) { #
+ # formula fit (only)
+ if(missing(newdata)) newdata <- model.frame(object)
+ else {
+ newdata <- model.frame(as.formula(delete.response(Terms)),
+ newdata, na.action=function(x) x,
+ xlev = object$xlevels)
+ }
+ x <- model.matrix(delete.response(Terms), newdata,
+ contrasts = object$contrasts)
+ xint <- match("(Intercept)", colnames(x), nomatch=0)
+ if(xint > 0) x <- x[, -xint, drop=FALSE]
+ } else {
+ stop("object has no terms element")
+ }
+# <FIXME>: check for variable names
+# if(length(colnames(x)) > 0 &&
+# any(colnames(x) != dimnames(object$means)[[2]]))
+# warning("Variable names in newdata do not match those in object")
+# </FIXME>
+ RET <- knn(object$learn$X, x,
+ object$learn$y, k=object$k, prob=TRUE)
+ if (type=="prob") return(attr(RET, "prob"))
+ else return(RET)
+}
diff --git a/R/irpart.R b/R/irpart.R
new file mode 100644
index 0000000..6a6fdf2
--- /dev/null
+++ b/R/irpart.R
@@ -0,0 +1,57 @@
+#
+# use update to fit multiple trees to bootstrap samples
+#
+irpart <- function(formula, data=NULL, weights, subset,
+ na.action=na.rpart, method, model=FALSE, x=FALSE, y=TRUE,
+ parms, control, cost, bcontrol, ...)
+{
+
+ mc <- match.call()
+ mc$bcontrol <- NULL
+ mc[[1]] <- as.name("rpart")
+
+ m <- match.call(expand.dots=FALSE)
+ m$model <- m$method <- m$control <- m$bcontrol <- NULL
+ m$x <- m$y <- m$parms <- m$... <- NULL
+ m$cost <- NULL
+ m$na.action <- na.action
+ m[[1]] <- as.name("model.frame.default")
+ m <- eval(m, parent.frame())
+
+ init_tree <- eval(mc, parent.frame())
+ nobs <- length(init_tree$where)
+ if (missing(weights)) {
+ weights <- rep(1.0, nobs)
+ } else {
+ warning("weights argument ignored in irpart")
+ }
+
+ yclasses <- c(class = "sclass", exp = "ssurv", anova = "sreg", poisson = "sreg")
+
+ #
+ # Bagging: repeat this several times!
+ #
+
+ if (is.null(bcontrol)) stop("bcontrol not given")
+ mod <- vector(mode="list", length=bcontrol$nbagg)
+
+ for (b in 1:bcontrol$nbagg) {
+ if (bcontrol$nbagg > 1)
+ bindx <- sample(1:nobs, bcontrol$ns, replace=bcontrol$replace)
+ else
+ bindx <- 1:nobs
+ tab <- tabulate(bindx, nbins = nobs)
+
+ mc$data <- m[bindx,,drop = FALSE] ### tab * weights
+ ans <- eval(mc, parent.frame())
+
+ # return the appropriate class
+ this <- list(bindx = bindx, btree = ans)
+
+ class(this) <- yclasses[init_tree$method]
+
+ mod[[b]] <- this
+ }
+ mod
+}
+
diff --git a/R/kfoldcv.R b/R/kfoldcv.R
new file mode 100644
index 0000000..e8528da
--- /dev/null
+++ b/R/kfoldcv.R
@@ -0,0 +1,21 @@
+# $Id: kfoldcv.R,v 1.3 2002/09/12 08:56:42 hothorn Exp $
+
+kfoldcv <- function(k,N, nlevel=NULL) {
+ if (is.null(nlevel)) {
+ # no stratification
+ if (k > N) return(c(rep(1, N), rep(0, k-N)))
+ fl <- floor(N/k)
+ ce <- ceiling(N/k)
+ if (fl == ce) return(rep(fl, k))
+ else
+ return(c(rep(ce, round((N/k - fl)*k)), rep(fl, round((1 - (N/k -
+ fl))*k))))
+ } else {
+ # stratification
+ # if (!is.integer(nlevel)) stop("nlevel is not a vector if integers")
+ kmat <- matrix(0, ncol=k, nrow=length(nlevel))
+ for (i in 1:length(nlevel))
+ kmat[i,] <- kfoldcv(k, nlevel[i])
+ return(kmat)
+ }
+}
diff --git a/R/mypredict.lm.R b/R/mypredict.lm.R
new file mode 100644
index 0000000..78c6739
--- /dev/null
+++ b/R/mypredict.lm.R
@@ -0,0 +1,32 @@
+# $Id: mypredict.lm.R,v 1.7 2003/04/02 11:22:49 peters Exp $
+
+mypredict.lm <- function(object, newdata) {
+
+ xn <- as.data.frame(newdata)
+
+ test <- attr(terms(object), "term.labels")
+ xn <- xn[,test]
+
+ if (!is.null(nrow(xn))) {
+ pred <- rep(NA, nrow(xn))
+ names(pred) <- row.names(xn)
+ } else {
+ pred <- NA
+ names(pred) <- "1"
+ }
+
+ # evaluate na.omit (delete lines containing NA)
+
+ xnn <- na.omit(xn)
+
+ # attr(xnn, "na.action") returns which na.action is
+ # evaluated, lines and corresponding row.name where NAs occur
+
+ if(is.null(attr(xnn, "na.action")))
+ pred <- predict(object, xnn)
+ else
+ pred[-attr(xnn, "na.action")] <- predict(object, xnn)
+
+ pred
+
+}
diff --git a/R/predict.bagging.R b/R/predict.bagging.R
new file mode 100644
index 0000000..4f0b9e6
--- /dev/null
+++ b/R/predict.bagging.R
@@ -0,0 +1,267 @@
+# $Id: predict.bagging.R,v 1.17 2009/03/27 16:18:38 hothorn Exp $
+
+uwhich.max <- function(x) {
+ # need to determine all maxima in order to sample from them
+ wm <- (1:length(x))[x == max(x)]
+ if (length(wm) > 1)
+ wm <- wm[sample(length(wm), 1)]
+ wm
+}
+
+predict.classbagg <- function(object, newdata=NULL, type=c("class", "prob"),
+ aggregation=c("majority", "average", "weighted"), ...) {
+ type <- match.arg(type)
+ agg <- match.arg(aggregation)
+ if (missing(newdata)) {
+ if (length(object$mtrees) < 10)
+ stop("cannot compute out-of-bag predictions for small number of trees")
+ OOB <- TRUE
+ if (!is.null(object$X))
+ newdata <- object$X
+ else
+ stop("cannot compute out-of-bag predictions without object$X!")
+ } else {
+ OOB <- FALSE
+ }
+ if (!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
+ N <- nrow(newdata)
+ if (!object$comb) {
+ tree <- object$mtrees[[1]]$btree
+ Terms <- delete.response(tree$terms)
+ act <- (tree$call)$na.action
+ if (is.null(act)) act<- na.rpart
+ newdata <- model.frame(Terms, newdata, na.action = act,
+ xlev=attr(tree, "xlevels"))
+ newdata <- getFromNamespace("rpart.matrix", ns = "rpart")(newdata)
+ }
+ classes <- levels(object$y)
+ switch(agg, "majority" = {
+ vote <- matrix(0, nrow=N, ncol=length(classes))
+ for (i in 1:length(object$mtrees)) {
+ if (OOB) {
+ bindx <- object$mtrees[[i]]$bindx
+ if (!is.null(object$mtrees[[i]]$bfct))
+ stop("cannot compute out-of-bag estimate for combined models!")
+ pred <- predict(object$mtrees[[i]], newdata, type="class")
+ tindx <- cbind((1:N), pred)[-bindx,]
+ } else {
+ tindx <- cbind(1:N, predict(object$mtrees[[i]], newdata,
+ type="class"))
+ }
+ vote[tindx] <- vote[tindx] + 1
+ }
+ if (type=="class") {
+ RET <- factor(classes[apply(vote, 1, uwhich.max)])
+ } else {
+ RET <- vote/apply(vote, 1, sum)
+ colnames(RET) <- classes
+ }
+ },
+ "average" = {
+ cprob <- matrix(0, nrow=N, ncol=length(classes))
+ if (OOB) ncount <- rep(0,N) else ncount <- length(object$mtrees)
+ for (i in 1:length(object$mtrees)) {
+ if (OOB) {
+ bindx <- object$mtrees[[i]]$bindx
+ pred <- predict(object$mtrees[[i]], newdata, type="prob")[-bindx,]
+ tindx <- (1:N)[-bindx]
+ ncount[tindx] <- ncount[tindx] + 1
+ } else {
+ pred <- predict(object$mtrees[[i]], newdata, type="prob")
+ tindx <- 1:N
+ }
+ cprob[tindx,] <- cprob[tindx,] + pred
+ }
+ switch(type, "class" = {
+ RET <- as.factor(apply(cprob, 1, uwhich.max))
+ levels(RET) <- classes
+ },
+ "prob" = {
+ ncount[ncount < 1] <- NA
+ RET <- cprob / ncount
+ colnames(RET) <- classes
+ })
+ },
+ "weighted" = {
+ agglsample <- matrix(0, ncol=length(classes), nrow=N)
+ for (i in 1:length(object$mtrees)) {
+ bdata <- object$y[object$mtrees[[i]]$bindx]
+ newpart <- getpartition(object$mtrees[[i]], newdata)
+ oldpart <- object$mtrees[[i]]$btree$where
+ if (OOB)
+ tindx <- (1:N)[-object$mtrees[[i]]$bindx]
+ else
+ tindx <- 1:N
+ for (j in tindx) {
+ aggobs <- table(bdata[oldpart == newpart[j]])
+ agglsample[j,] <- agglsample[j,] + aggobs
+ }
+ }
+ switch(type, "class" = {
+ RET <- c()
+ for (j in 1:N)
+ RET <- as.factor(c(RET, uwhich.max(agglsample[j,])))
+ levels(RET) <- classes
+ },
+ "prob" = {
+ RET <- agglsample / apply(agglsample, 1, sum)
+ colnames(RET) <- classes
+ })
+ })
+ RET
+}
+
+predict.sclass <- function(object, newdata=NULL, type=c("class", "prob"),
+...) {
+ if (!is.null(object$bfct))
+ newdata <- cbind(newdata, object$bfct(newdata))
+ pred <- predict.irpart(object$btree, newdata, type=type)
+ RET <- pred
+ if (type == "class") RET <- as.integer(pred)
+ if (type == "prob" && is.vector(pred)) RET <- cbind(pred, 1 - pred)
+ RET
+}
+
+
+predict.regbagg <- function(object, newdata=NULL, aggregation=c("average",
+"weighted"), ...) {
+ agg <- match.arg(aggregation)
+ if (missing(newdata)) {
+ if (length(object$mtrees) < 10)
+ stop("cannot compute out-of-bag predictions for small number of trees")
+ OOB <- TRUE
+ if (!is.null(object$X))
+ newdata <- object$X
+ else
+ stop("cannot compute out-of-bag predictions without object$X!")
+ } else {
+ OOB <- FALSE
+ }
+ if (!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
+ N <- nrow(newdata)
+ if (!object$comb) {
+ tree <- object$mtrees[[1]]$btree
+ Terms <- delete.response(tree$terms)
+ act <- (tree$call)$na.action
+ if (is.null(act)) act<- na.rpart
+ newdata <- model.frame(Terms, newdata, na.action = act,
+ xlev=attr(tree, "xlevels"))
+ newdata <- getFromNamespace("rpart.matrix", ns = "rpart")(newdata)
+ }
+ switch(agg, "average" = {
+ cprob <- rep(0, N)
+ if (OOB) ncount <- rep(0,N) else ncount <- length(object$mtrees)
+ for (i in 1:length(object$mtrees)) {
+ if (OOB) {
+ bindx <- object$mtrees[[i]]$bindx
+ if (!is.null(object$mtrees[[i]]$bfct))
+ stop("cannot compute out-of-bag estimate for combined models!")
+ pred <- predict(object$mtrees[[i]], newdata)[-bindx]
+ tindx <- (1:N)[-bindx]
+ ncount[tindx] <- ncount[tindx] + 1
+ } else {
+ pred <- predict(object$mtrees[[i]], newdata)
+ tindx <- 1:N
+ }
+ cprob[tindx] <- cprob[tindx] + pred
+ }
+ ncount[ncount < 1] <- NA
+ RET <- cprob / ncount
+ },
+ "weighted" = {
+ agglsample <- rep(0, N)
+ ncount <- rep(0, N)
+ for (i in 1:length(object$mtrees)) {
+ bdata <- object$y[object$mtrees[[i]]$bindx]
+ newpart <- getpartition(object$mtrees[[i]], newdata)
+ oldpart <- object$mtrees[[i]]$btree$where
+ if (OOB)
+ tindx <- (1:N)[-object$mtrees[[i]]$bindx]
+ else
+ tindx <- 1:N
+ for (j in tindx) {
+ aggobs <- bdata[oldpart == newpart[j]]
+ agglsample[j] <- agglsample[j] + sum(aggobs)
+ ncount[j] <- ncount[j] + length(aggobs)
+ }
+ }
+ ncount[ncount < 1] <- NA
+ RET <- agglsample / ncount
+ })
+ RET
+}
+
+
+predict.sreg <- function(object, newdata=NULL, ...) {
+ if (!is.null(object$bfct))
+ newdata <- cbind(newdata, object$bfct(newdata))
+ predict.irpart(object$btree, newdata)
+}
+
+
+predict.survbagg <- function(object, newdata=NULL, ...) {
+ if (missing(newdata)) {
+ if (length(object$mtrees) < 10)
+ stop("cannot compute out-of-bag predictions for small number of trees")
+ OOB <- TRUE
+ if (!is.null(object$X))
+ newdata <- object$X
+ else
+ stop("cannot compute out-of-bag predictions without object$X!")
+ } else {
+ OOB <- FALSE
+ }
+ if (!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
+ N <- nrow(newdata)
+ if (!object$comb) {
+ tree <- object$mtrees[[1]]$btree
+ Terms <- delete.response(tree$terms)
+ act <- (tree$call)$na.action
+ if (is.null(act)) act<- na.rpart
+ newdata <- model.frame(Terms, newdata, na.action = act,
+ xlev=attr(tree, "xlevels"))
+ newdata <- getFromNamespace("rpart.matrix", ns = "rpart")(newdata)
+ }
+ agglsample <- list()
+ aggcens <- list()
+ for (j in 1:N) {
+ agglsample <- c(agglsample, list(c()))
+ aggcens <- c(aggcens, list(c()))
+ }
+ for (i in 1:length(object$mtrees)) {
+ bdata <- object$y[object$mtrees[[i]]$bindx]
+ newpart <- getpartition(object$mtrees[[i]], newdata)
+ oldpart <- object$mtrees[[i]]$btree$where
+ if (OOB) {
+ if (!is.null(object$mtrees[[i]]$bfct))
+ stop("cannot compute out-of-bag estimate for combined models!")
+ tindx <- (1:N)[-object$mtrees[[i]]$bindx]
+ } else {
+ tindx <- 1:N
+ }
+ for (j in tindx) {
+ aggobs <- bdata[oldpart == newpart[j],1]
+ agglsample[[j]] <- c(agglsample[[j]], aggobs)
+ aggobs <- bdata[oldpart == newpart[j],2]
+ aggcens[[j]] <- c(aggcens[[j]], aggobs)
+ }
+ }
+ RET <- list()
+ for (j in 1:N)
+ RET <- c(RET, list(survfit(Surv(agglsample[[j]], aggcens[[j]]) ~ 1)))
+ RET
+}
+
+getpartition <- function(object, newdata=NULL) {
+ if (!is.null(object$bfct)) {
+ newdata <- cbind(newdata, object$bfct(newdata))
+ Terms <- delete.response(object$btree$terms)
+ act <- (object$btree$call)$na.action
+ if (is.null(act)) act<- na.rpart
+ newdata <- model.frame(Terms, newdata, na.action = act,
+ xlev=attr(object$btree, "xlevels"))
+ newdata <- getFromNamespace("rpart.matrix", ns = "rpart")(newdata)
+ }
+ getFromNamespace("pred.rpart", ns = "rpart")(object$btree, newdata)
+}
+
diff --git a/R/predict.inbagg.R b/R/predict.inbagg.R
new file mode 100644
index 0000000..bb862da
--- /dev/null
+++ b/R/predict.inbagg.R
@@ -0,0 +1,30 @@
+predict.inbagg <- function(object, newdata, ...) {
+ if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
+ if(any(names(object$W) %in% names(newdata))) newdata <- newdata[!(names(newdata) %in% names(object$W))]
+ NBAGG <- length(object$mtrees)
+ N <- nrow(newdata)
+ classes <- levels(object$y)
+ vote <- matrix(0, nrow=N, ncol=length(classes))
+ for(i in 1:NBAGG) {
+ intermed <- object$mtrees[[i]]$bfct(newdata)
+# XX <- data.frame(newdata, intermed)
+ if(!is.null(object$mtrees[[i]]$btree$fixed.function)) {
+ names(intermed) <- sub(".[0-9]$", "", names(intermed))
+ XX <- data.frame(newdata, intermed)
+# names(XX)[(ncol(XX)-ncol(intermed)+1):ncol(XX)] <- sub(".[0-9]$", "", names(intermed))
+ res <- object$mtrees[[i]]$btree$fixed.function(XX)
+ } else {
+ XX <- data.frame(newdata, intermed)
+ if(is.null(object$mtrees[[i]]$btree$predict)) {
+ res <- try(predict(object$mtrees[[i]]$btree$model, newdata = XX, ...))
+ } else {
+ res <- try(object$mtrees[[i]]$btree$predict(object$mtrees[[i]]$btree$model, newdata = XX, ...))
+ }
+ }
+ res <- cbind(1:N, res)
+ vote[res] <- vote[res] +1
+ }
+
+ RET <- factor(classes[apply(vote, 1, uwhich.max)])
+ RET
+}
diff --git a/R/predict.inclass.R b/R/predict.inclass.R
new file mode 100644
index 0000000..eda85b4
--- /dev/null
+++ b/R/predict.inclass.R
@@ -0,0 +1,37 @@
+# $Id: predict.inclass.R,v 1.19 2003/03/31 08:44:16 peters Exp $
+
+# Additional option type ="class", if intermediate is nominal
+
+predict.inclass <- function(object, newdata, ...)
+{
+ if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
+ q <- length(object$model.intermediate) # number of intermediates
+ namen <- names(object$model.intermediate)
+
+ intermediate <- is.vector(NULL, mode = "NULL")
+
+ for(i in 1:q) {
+ if(!is.null(object$para.intermediate[[i]][["predict"]])) {
+ RET <- object$para.intermediate[[i]][["predict"]](object$model.intermediate[[i]], newdata = newdata, ...)
+ } else {
+ RET <- predict(object$model.intermediate[[i]], newdata = newdata, ...)
+ }
+ intermediate <- data.frame(intermediate, RET)
+ }
+
+ intermediate <- intermediate[,-1]
+ names(intermediate) <- namen
+
+ intermediate <- data.frame(newdata[,!(names(newdata) %in% names(intermediate))], intermediate)
+
+ if(!is.function(object$para.response)) {
+ if(!is.null(object$para.response[["predict"]])) {
+ RET <- object$para.response[["predict"]](object$model.response, newdata = intermediate, ...)
+ } else {
+ RET <- predict(object$model.response, newdata = intermediate, ...)
+ }
+ } else {
+ RET <- object$para.response(intermediate)
+ }
+ return(RET)
+}
diff --git a/R/predict.irpart.R b/R/predict.irpart.R
new file mode 100644
index 0000000..5d55970
--- /dev/null
+++ b/R/predict.irpart.R
@@ -0,0 +1,54 @@
+#
+# a modified version of `predict.rpart.s' from the rpart package
+# see COPYRIGHTS for details.
+#
+predict.irpart <-
+function(object, newdata = list(),
+ type = c("vector", "prob", "class", "matrix"), ...) {
+ if(!inherits(object, "rpart"))
+ stop("Not legitimate tree")
+ mtype <- missing(type)
+ type <- match.arg(type)
+ if(missing(newdata))
+ where <- object$where
+ else {
+ if(is.null(attr(newdata, "terms")) & !inherits(newdata, "rpart.matrix")) {
+ Terms <- delete.response(object$terms)
+ act <- (object$call)$na.action
+ if (is.null(act)) act<- na.rpart
+ newdata <- model.frame(Terms, newdata, na.action = act,
+ xlev=attr(object, "xlevels"))
+ newdata <- getFromNamespace("rpart.matrix", ns = "rpart")(newdata)
+ }
+ where <- getFromNamespace("pred.rpart", ns = "rpart")(object, newdata)
+ }
+ frame <- object$frame
+ method <- object$method
+ ylevels <- attr(object, "ylevels")
+ nclass <- length(ylevels)
+ if(mtype && nclass > 0) type <- "prob"
+ if(type == "vector" || (type=="matrix" && is.null(frame$yval2))) {
+ pred <- frame$yval[where]
+ names(pred) <- names(where)
+ }
+ else if (type == "matrix") {
+ pred <- frame$yval2[where,]
+ dimnames(pred) <- list(names(where), NULL)
+ }
+ else if(type == "class" && nclass > 0) {
+ pred <- factor(ylevels[frame$yval[where]], levels=ylevels)
+ names(pred) <- names(where)
+ }
+ else if (type == "prob" && nclass > 0) {
+ pred <- frame$yval2[where, 1 + nclass + 1:nclass]
+ dimnames(pred) <- list(names(where), ylevels)
+ }
+ else stop("Invalid prediction for rpart object")
+
+ # Expand out the missing values in the result
+ # But only if operating on the original dataset
+ if (missing(newdata) && !is.null(object$na.action))
+ pred <- naresid(object$na.action, pred)
+ pred
+}
+
diff --git a/R/print.R b/R/print.R
new file mode 100644
index 0000000..bb1ef4b
--- /dev/null
+++ b/R/print.R
@@ -0,0 +1,218 @@
+#$Id: print.R,v 1.4 2004/02/09 08:08:21 peters Exp $
+
+print.classbagg <- function(x, digits=4, ...)
+{
+ cat("\n")
+ B <- length(x$mtrees)
+ if (B > 1)
+ method <- paste("Bagging classification trees with", B,
+ "bootstrap replications")
+ else
+ method <- "Classification tree"
+ cat(method, "\n")
+ if (!is.null(x$call)) {
+ cat("\nCall: ")
+ print(x$call)
+ cat("\n")
+ }
+ if (x$OOB) {
+ cat("Out-of-bag estimate of misclassification error: ",
+ round(x$err, digits), "\n")
+ }
+ cat("\n")
+}
+
+print.regbagg <- function(x, digits=4, ...)
+{
+ cat("\n")
+ B <- length(x$mtrees)
+ if (B > 1)
+ method <- paste("Bagging regression trees with", B,
+ "bootstrap replications")
+ else
+ method <- "Regression tree"
+ cat(method, "\n")
+ if (!is.null(x$call)) {
+ cat("\nCall: ")
+ print(x$call)
+ cat("\n")
+ }
+ if (x$OOB)
+ cat("Out-of-bag estimate of root mean squared error: ",
+ round(x$err, digits), "\n")
+ cat("\n")
+
+}
+
+print.survbagg <- function(x, digits=4, ...)
+{
+ cat("\n")
+ B <- length(x$mtrees)
+ if (B > 1)
+ method <- paste("Bagging survival trees with", B,
+ "bootstrap replications")
+ else
+ method <- "Survival tree"
+ cat(method, "\n")
+ if (!is.null(x$call)) {
+ cat("\nCall: ")
+ print(x$call)
+ cat("\n")
+ }
+ if (x$OOB)
+ cat("Out-of-bag estimate of Brier's score: ",
+ round(x$err, digits), "\n")
+ cat("\n")
+
+}
+
+summary.classbagg <- function(object, ...)
+{
+ print(object, ...)
+ class(object) <- "summary.bagging"
+ object
+}
+
+summary.regbagg <- function(object, ...)
+{
+ print(object, ...)
+ class(object) <- "summary.bagging"
+ object
+}
+
+summary.survbagg <- function(object, ...)
+{
+ print(object, ...)
+ class(object) <- "summary.bagging"
+ object
+}
+
+print.summary.bagging <- function(x, digits = max(3, getOption("digits")-3),
+ ...)
+{
+ cat("Trees: \n")
+ print(x$mtrees)
+ invisible(x$mtrees)
+}
+
+print.cvclass <- function(x, digits=4, ...)
+{
+ cat("\n")
+ if (!is.null(x$call)) {
+ cat("Call:\n")
+ print(x$call)
+ cat("\n")
+ }
+ cat("\t", paste(x$k, "-fold", sep=""),
+ "cross-validation estimator of misclassification error \n")
+ cat("\n")
+ cat("Misclassification error: ", round(x$error, digits), "\n")
+ cat("\n")
+}
+
+print.bootestclass <- function(x, digits=4, ...) {
+ if(all(names(x)[names(x)!="call"] %in% c("boot", "632plus"))) {
+ XX <- x
+ for(i in c("boot", "632plus")) {
+ x <- XX[[i]]
+ x$call <- XX[["call"]]
+ cat("\n")
+ if (!is.null(x$call)) {
+ cat("Call:\n")
+ print(x$call)
+ cat("\n")
+ }
+ if (x$bc632plus) {
+ cat("\t", ".632+ Bootstrap estimator of misclassification error \n")
+ } else {
+ cat("\t", "Bootstrap estimator of misclassification error \n")
+ }
+ cat("\t with" , x$nboot, "bootstrap replications\n")
+ cat("\n")
+ cat("Misclassification error: ", round(x$error, digits), "\n")
+ if (!x$bc632plus) cat("Standard deviation:", round(x$sd, digits), "\n")
+ cat("\n")
+ }
+ } else {
+# if(!all(names(x) %in% c("boot", "632plus"))){
+ cat("\n")
+ if (!is.null(x$call)) {
+ cat("Call:\n")
+ print(x$call)
+ cat("\n")
+ }
+ if (x$bc632plus)
+ cat("\t", ".632+ Bootstrap estimator of misclassification error \n")
+ else
+ cat("\t", "Bootstrap estimator of misclassification error \n")
+ cat("\t with" , x$nboot, "bootstrap replications\n")
+ cat("\n")
+ cat("Misclassification error: ", round(x$error, digits), "\n")
+ if (!x$bc632plus)
+ cat("Standard deviation:", round(x$sd, digits), "\n")
+ cat("\n")
+ }
+}
+
+
+
+print.cvreg <- function(x, digits=4, ...)
+{
+ cat("\n")
+ if (!is.null(x$call)) {
+ cat("Call:\n")
+ print(x$call)
+ cat("\n")
+ }
+ cat("\t", paste(x$k, "-fold", sep=""),
+ "cross-validation estimator of root mean squared error\n")
+ cat("\n")
+ cat("Root mean squared error: ", round(x$error, digits), "\n")
+ cat("\n")
+}
+
+print.bootestreg <- function(x, digits=4, ...)
+{
+ cat("\n")
+ if (!is.null(x$call)) {
+ cat("Call:\n")
+ print(x$call)
+ cat("\n")
+ }
+ cat("\t", "Bootstrap estimator of root mean squared error \n")
+ cat("\t with" , x$nboot, "bootstrap replications\n")
+ cat("\n")
+ cat("Root mean squared error: ", round(x$error, digits), "\n")
+ cat("\n")
+}
+
+
+print.cvsurv <- function(x, digits=4, ...)
+{
+ cat("\n")
+ if (!is.null(x$call)) {
+ cat("Call:\n")
+ print(x$call)
+ cat("\n")
+ }
+ cat("\t", paste(x$k, "-fold", sep=""),
+ "cross-validation estimator of Brier's score\n")
+ cat("\n")
+ cat("Brier's score: ", round(x$error, digits), "\n")
+ cat("\n")
+}
+
+print.bootestsurv <- function(x, digits=4, ...)
+{
+ cat("\n")
+ if (!is.null(x$call)) {
+ cat("Call:\n")
+ print(x$call)
+ cat("\n")
+ }
+ cat("\t", "Bootstrap estimator of Brier's score\n")
+ cat("\t with" , x$nboot, "bootstrap replications\n")
+ cat("\n")
+ cat("Brier's score: ", round(x$error, digits), "\n")
+ cat("\n")
+}
diff --git a/R/prune.bagging.R b/R/prune.bagging.R
new file mode 100644
index 0000000..2eb0045
--- /dev/null
+++ b/R/prune.bagging.R
@@ -0,0 +1,23 @@
+# $Id: prune.bagging.R,v 1.2 2002/09/12 08:59:13 hothorn Exp $
+
+prune.classbagg <- function(tree, cp=0.01,...)
+{
+ for(i in 1:length(tree$mtrees))
+ tree$mtrees[[i]]$btree <- prune( tree$mtrees[[i]]$btree, cp=cp, ...)
+ tree
+}
+
+prune.regbagg <- function(tree, cp=0.01,...)
+{
+ for(i in 1:length(tree$mtrees))
+ tree$mtrees[[i]]$btree <- prune( tree$mtrees[[i]]$btree, cp=cp, ...)
+ tree
+}
+
+
+prune.survbagg <- function(tree, cp=0.01,...)
+{
+ for(i in 1:length(tree$mtrees))
+ tree$mtrees[[i]]$btree <- prune( tree$mtrees[[i]]$btree, cp=cp, ...)
+ tree
+}
diff --git a/R/rsurv.R b/R/rsurv.R
new file mode 100644
index 0000000..7b85ccb
--- /dev/null
+++ b/R/rsurv.R
@@ -0,0 +1,56 @@
+# $Id: rsurv.R,v 1.5 2003/03/31 08:44:16 peters Exp $
+
+rsurv <- function(N, model=c("A", "B", "C", "D", "tree"), gamma=NULL, fact=1,
+ pnon=10, gethaz=FALSE)
+{
+ model <- match.arg(model)
+ X <- matrix(runif(N*5), ncol=5)
+ colnames(X) <- paste("X", 1:ncol(X), sep="")
+ switch(model,
+ "A" = {
+ time <- rexp(N)
+ haz <- rep(1, N)
+ },
+ "B" = {
+ hazard <- as.numeric(X[,1] <= 0.5 & X[,2] > 0.5)
+ time <- rexp(N)
+ time[hazard == 1] <- rexp(sum(hazard==1), exp(3))
+ haz <- rep(1, N)
+ haz[hazard == 1] <- exp(3)
+ },
+ "C" = {
+ hazard <- 3*X[,1] + X[,2]
+ haz <- exp(hazard)
+ time <- sapply(haz, rexp, n=1)
+ },
+ "D" = {
+ hazard <- 3*X[,1] - 3*X[,2] + 4*X[,3] - 2*X[,4]
+ haz <- exp(hazard)
+ time <- sapply(haz, rexp, n=1)
+ },
+ "tree" = {
+ hazard <- rep(0, nrow(X))
+ hazard[(X[,1] <= 0.5 & X[,2] <= 0.5)] <- 0
+ hazard[(X[,1] <= 0.5 & X[,2] > 0.5 & X[,4] <= 0.5)] <- 1
+ hazard[(X[,1] <= 0.5 & X[,2] > 0.5 & X[,4] > 0.5)] <- 0
+ hazard[(X[,1] > 0.5 & X[,3] <= 0.5 & X[,5] <= 0.3)] <- 1
+ hazard[(X[,1] > 0.5 & X[,3] <= 0.5 & X[,5] > 0.3)] <- 2
+ hazard[(X[,1] > 0.5 & X[,3] > 0.5 & X[,4] <= 0.7)] <- 2
+ hazard[(X[,1] > 0.5 & X[,3] > 0.5 & X[,4] > 0.7)] <- 3
+ hazard <- hazard * fact
+ haz <- exp(hazard)
+ time <- sapply(haz, rexp, n=1)
+ if (pnon > 0)
+ X <- cbind(X, matrix(runif(N*pnon), ncol=pnon))
+ colnames(X) <- paste("X", 1:ncol(X), sep="")
+ })
+ if (!is.null(gamma))
+ censtime <- runif(N, min=0, max=gamma)
+ else
+ censtime <- Inf
+ cens <- as.numeric(time <= censtime)
+ time <- pmin(time, censtime)
+ simdat <- as.data.frame(cbind(time, cens, X))
+ if (gethaz) attr(simdat, "hazard") <- haz
+ return(simdat)
+}
diff --git a/R/sbrier.R b/R/sbrier.R
new file mode 100644
index 0000000..1f0447b
--- /dev/null
+++ b/R/sbrier.R
@@ -0,0 +1,151 @@
+# $Id: sbrier.R,v 1.5 2009/03/27 16:18:38 hothorn Exp $
+
+getsurv <- function(obj, times)
+{
+ # get the survival probability for times from KM curve `obj'
+
+ if (!inherits(obj, "survfit")) stop("obj is not of class survfit")
+ # <FIXME: methods may have problems with that>
+ class(obj) <- NULL
+ # </FIXME>
+ lt <- length(times)
+ nsurv <- times
+
+ # if the times are the same, return the km-curve
+
+ if(length(times) == length(obj$time)) {
+ if (all(times == obj$time)) return(obj$surv)
+ }
+
+ # otherwise get the km-value for every element of times separatly
+
+ inside <- times %in% obj$time
+ for (i in (1:lt)) {
+ if (inside[i])
+ nsurv[i] <- obj$surv[obj$time == times[i]]
+ else {
+ less <- obj$time[obj$time < times[i]]
+ if (length(less) == 0)
+ nsurv[i] <- 1
+ else
+ nsurv[i] <- obj$surv[obj$time == max(less)]
+ }
+ }
+ nsurv
+}
+
+sbrier <- function(obj, pred, btime = range(obj[,1]))
+{
+ if(!inherits(obj, "Surv"))
+ stop("obj is not of class Surv")
+
+ # check for right censoring
+
+ # <FIXME>
+ class(obj) <- NULL
+ # </FIXME>
+ if (attr(obj, "type") != "right")
+ stop("only right-censoring allowed")
+ N <- nrow(obj)
+
+ # get the times and censoring of the data, order them with resp. to time
+
+ time <- obj[,1]
+ ot <- order(time)
+ cens <- obj[ot,2]
+ time <- time[ot]
+
+ # get the times to compute the (integrated) Brier score over
+
+ if (is.null(btime)) stop("btime not given")
+ if (length(btime) < 1) stop("btime not given")
+
+ if (length(btime) == 2) {
+ if (btime[1] < min(time)) warning("btime[1] is smaller than min(time)")
+ if (btime[2] > max(time)) warning("btime[2] is larger than max(time)")
+ btime <- time[time >= btime[1] & time <=
+ btime[2]]
+ }
+
+ ptype <- class(pred)
+ # <begin> S3 workaround
+ if (is.null(ptype)) {
+ if (is.vector(pred)) ptype <- "vector"
+ if (is.list(pred)) ptype <- "list"
+ }
+ # <end>
+ if (ptype == "numeric" && is.vector(pred)) ptype <- "vector"
+
+ survs <- NULL
+ switch(ptype, survfit = {
+ survs <- getsurv(pred, btime)
+ survs <- matrix(rep(survs, N), nrow=length(btime))
+ }, list = {
+ if (!inherits(pred[[1]], "survfit")) stop("pred is not a list of survfit objects")
+ if (length(pred) != N) stop("pred must be of length(time)")
+ pred <- pred[ot]
+ survs <- matrix(unlist(lapply(pred, getsurv, times = btime)),
+ nrow=length(btime), ncol=N)
+ }, vector = {
+ if (length(pred) != N) stop("pred must be of length(time)")
+ if (length(btime) != 1) stop("cannot compute integrated Brier score with pred")
+ survs <- pred[ot]
+ }, matrix = {
+ # <FIXME>
+ if (all(dim(pred) == c(length(btime), N)))
+ survs <- pred[,ot]
+ else
+ stop("wrong dimensions of pred")
+ # </FIXME>
+ })
+ if (is.null(survs)) stop("unknown type of pred")
+
+ # reverse Kaplan-Meier: estimate censoring distribution
+
+ ### deal with ties
+ hatcdist <- prodlim(Surv(time, cens) ~ 1,reverse = TRUE)
+ csurv <- predict(hatcdist, times = time, type = "surv")
+ csurv[csurv == 0] <- Inf
+ # hatcdist <- survfit(Surv(time, 1 - cens) ~ 1)
+ # csurv <- getsurv(hatcdist, time)
+ # csurv[csurv == 0] <- Inf
+
+ bsc <- rep(0, length(btime))
+
+ # compute Lebesque-integrated Brier score
+
+ if (length(btime) > 1) {
+ for (j in 1:length(btime)) {
+ help1 <- as.integer(time <= btime[j] & cens == 1)
+ help2 <- as.integer(time > btime[j])
+ bsc[j] <- mean((0 - survs[j,])^2*help1*(1/csurv) +
+ (1-survs[j,])^2*help2*(1/csurv[j]))
+ }
+
+ ### apply trapezoid rule
+ idx <- 2:length(btime)
+ RET <- diff(btime) %*% ((bsc[idx - 1] + bsc[idx]) / 2)
+ RET <- RET / diff(range(btime))
+
+ ### previously was
+ #diffs <- c(btime[1], btime[2:length(btime)] -
+ # btime[1:(length(btime)-1)])
+ #RET <- sum(diffs*bsc)/max(btime)
+ names(RET) <- "integrated Brier score"
+ attr(RET, "time") <- range(btime)
+
+ # compute Brier score at one single time `btime'
+
+ } else {
+ help1 <- as.integer(time <= btime & cens == 1)
+ help2 <- as.integer(time > btime)
+ cs <- predict(hatcdist, times=btime, type = "surv")
+ ### cs <- getsurv(hatcdist, btime)
+ if (cs == 0) cs <- Inf
+ RET <- mean((0 - survs)^2*help1*(1/csurv) +
+ (1-survs)^2*help2*(1/cs))
+ names(RET) <- "Brier score"
+ attr(RET, "time") <- btime
+ }
+ RET
+}
diff --git a/R/slda.R b/R/slda.R
new file mode 100644
index 0000000..11a447a
--- /dev/null
+++ b/R/slda.R
@@ -0,0 +1,112 @@
+# $Id: slda.R,v 1.9 2005/06/29 08:50:28 hothorn Exp $
+
+# stabilized linear discriminant analysis according to Laeuter & Kropf
+
+slda <- function(y, ...) UseMethod("slda")
+
+slda.default <- function(y, ...)
+ stop(paste("Do not know how to handle objects of class", class(data)))
+
+slda.formula <- function(formula, data, subset, na.action=na.rpart, ...) {
+ cl <- match.call()
+ if(missing(formula)
+ || (length(formula) != 3)
+ || (length(attr(terms(formula[-2], data = data), "term.labels")) < 1)
+ || (length(attr(terms(formula[-3], data = data), "term.labels")) != 1))
+ stop("formula missing or incorrect")
+ m <- match.call(expand.dots = FALSE)
+ if(is.matrix(eval(m$data, parent.frame())))
+ m$data <- as.data.frame(data)
+ m[[1]] <- as.name("model.frame")
+ m$... <- NULL
+ m <- eval(m, parent.frame())
+ Terms <- attr(m, "terms")
+ grouping <- model.extract(m, "response")
+ x <- model.matrix(Terms, m)
+ xvars <- as.character(attr(Terms, "variables"))[-1]
+ if ((yvar <- attr(Terms, "response")) > 0) xvars <- xvars[-yvar]
+ xlev <- if (length(xvars) > 0) {
+ xlev <- lapply(m[xvars], levels)
+ xlev[!sapply(xlev, is.null)]
+ }
+ xint <- match("(Intercept)", colnames(x), nomatch=0)
+ if(xint > 0) x <- x[, -xint, drop=FALSE]
+ RET <- slda(y=grouping, X=x, ...)
+ RET$terms <- Terms
+ RET$call <- match.call()
+ RET$contrasts <- attr(x, "contrasts")
+ RET$xlevels <- xlev
+ attr(RET, "na.message") <- attr(m, "na.message")
+ if(!is.null(attr(m, "na.action"))) RET$na.action <- attr(m, "na.action")
+ RET
+}
+
+
+slda.factor <- function(y, X, q=NULL, ...) {
+
+ p <- ncol(X)
+ # substract global mean
+ Xnull <- X - apply(X, 2, mean)
+ if (!is.null(q)) {
+ if (floor(q) != q) stop("q is not an integer")
+ if (q > p) {
+ q <- p
+ warning("q is greater ncol(X), using q = ncol(X)")
+ }
+ if (q < 1) {
+ q <- 1
+ warning("q is less than 1, using q = 1")
+ }
+ }
+
+ # this is S_0 in Kropf (2000)
+ Snull <- cov(Xnull)
+ ewp <- svd(solve(diag(diag(Snull), ncol = ncol(Snull)))%*%Snull)
+ if (!is.complex(ewp$d)) {
+ # determine q by the number of eigenvalues > 1
+ if (is.null(q)) q <- sum(ewp$d > 1)
+ D <- ewp$v[,1:q]
+ if (q == 1) D <- as.matrix(D)
+ # Xstab is still spherically distributed (Fang & Zhang, Laeuter, Kropf &
+ # Glimm)!
+ } else {
+ D <- diag(p)
+ }
+ Xstab <- as.matrix(X) %*% D
+ colnames(Xstab) <- paste("V", 1:ncol(Xstab), sep="")
+ mylda <- lda(Xstab, grouping = y, ...)
+ RET <- list(scores = D, mylda = mylda)
+ class(RET) <- "slda"
+ RET
+}
+
+predict.slda <- function(object, newdata, ...) {
+ if(!inherits(object, "slda")) stop("object not of class slda")
+ if(!is.null(Terms <- object$terms)) { #
+ # formula fit (only)
+ if(missing(newdata)) newdata <- model.frame(object)
+ else {
+ newdata <- model.frame(as.formula(delete.response(Terms)),
+ newdata, na.action=function(x) x,
+ xlev = object$xlevels)
+ }
+ x <- model.matrix(delete.response(Terms), newdata,
+ contrasts = object$contrasts)
+ xint <- match("(Intercept)", colnames(x), nomatch=0)
+ if(xint > 0) x <- x[, -xint, drop=FALSE]
+ } else {
+ stop("object has no terms element")
+ }
+ if(ncol(x) != nrow(object$scores)) stop("wrong number of variables")
+# <FIXME>: check for variable names!
+# if(length(colnames(x)) > 0 &&
+# any(colnames(x) != dimnames(object$means)[[2]]))
+# warning("Variable names in newdata do not match those in object")
+# </FIXME>
+ X <- x %*% object$scores
+ if (inherits(object$mylda, "lda"))
+ return(predict(object$mylda, newdata=as.data.frame(X), ...))
+ else
+ stop(paste("Do not know how to predict from objects of class", class(object$mylda)))
+
+}
diff --git a/R/ssubset.R b/R/ssubset.R
new file mode 100644
index 0000000..221261c
--- /dev/null
+++ b/R/ssubset.R
@@ -0,0 +1,39 @@
+
+ssubset <- function(y, k, strat=TRUE) {
+ if (!is.factor(y)) stop("y is not of class factor")
+ N <- length(y)
+ nlevel <- table(y)
+ nindx <- list()
+ indx <- 1:N
+ outindx <- list()
+ if (strat) {
+ for (j in 1:length(nlevel))
+ nindx <- c(nindx, list(indx[which(y == levels(y)[j])]))
+ kmat <- kfoldcv(k, N, nlevel)
+ for (i in 1:k) {
+ sset <- kmat[,i]
+ kindx <- c()
+ for (j in 1:length(nlevel)) {
+ if (i > 1)
+ kindx <- c(kindx, nindx[[j]][(sum(kmat[j,
+ 1:(i-1)])+1):sum(kmat[j,1:i])])
+ else
+ kindx <- c(kindx, nindx[[j]][1:kmat[j,1]])
+ }
+ kindx <- kindx[!is.na(kindx)]
+ outindx <- c(outindx, list(kindx))
+ }
+ return(outindx)
+ } else {
+ kmat <- kfoldcv(k, N)
+ nindx <- indx
+ for (i in 1:k) {
+ if (i > 1)
+ outindx <- c(outindx,
+ list(nindx[(sum(kmat[1:(i-1)])+1):sum(kmat[1:i])]))
+ else
+ outindx <- c(outindx, list(nindx[1:kmat[1]]))
+ }
+ }
+ return(outindx)
+}
diff --git a/R/varset.R b/R/varset.R
new file mode 100644
index 0000000..816e392
--- /dev/null
+++ b/R/varset.R
@@ -0,0 +1,26 @@
+# $Id: varset.R,v 1.2 2002/03/26 16:29:15 hothorn Exp $
+
+varset <- function(N, sigma = 0.1, theta = 90, threshold = 0, u = 1:3)
+{
+ # create U
+ U <- matrix(rep(0, 4), ncol = 2)
+ U[1, 1] <- u[1]
+ U[1, 2] <- u[2]
+ U[2, 1] <- u[3]
+ U[2, 2] <- (theta-u[1]*u[3])/u[2]
+ lambda <- sqrt(U[1, 1]^2 + U[1, 2]^2)
+ U[1, ] <- U[1, ]/lambda
+ lambda <- sqrt(U[2, 1]^2 + U[2, 2]^2)
+ U[2, ] <- U[2, ]/lambda
+
+ e <- matrix(rnorm(2*N, sd = sigma), ncol = 2, byrow = TRUE)
+ expl <- matrix(rnorm(2*N), ncol = 2, byrow = TRUE)
+ inter <- t(U %*%t(expl) + t(e))
+ z <- (inter > threshold)
+ resp <- as.factor(ifelse((z[,1] + z[,2]) > 1, 1, 0))
+ colnames(expl) <- c("x1", "x2")
+ colnames(inter) <- c("y1", "y2")
+
+ result <- list(explanatory = expl, intermediate = inter, response = resp)
+ return(result)
+}
diff --git a/build/vignette.rds b/build/vignette.rds
new file mode 100644
index 0000000..da26a55
Binary files /dev/null and b/build/vignette.rds differ
diff --git a/cleanup b/cleanup
new file mode 100755
index 0000000..7947690
--- /dev/null
+++ b/cleanup
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+for f in ./R/*~; do
+ rm -f $f
+done
+
+for f in ./tests/*~; do
+ rm -f $f
+done
+
+for f in .*~; do
+ rm -f $f
+done
+
+for f in ./man/*~; do
+ rm -f $f
+done
+
+for f in ./data/*~; do
+ rm -f $f
+done
+
+for f in *~; do
+ rm -f $f
+done
+
+find . -name "DEADJOE" -exec rm -f {} \;
+
+exit 0
diff --git a/data/DLBCL.rda b/data/DLBCL.rda
new file mode 100644
index 0000000..714d8af
Binary files /dev/null and b/data/DLBCL.rda differ
diff --git a/data/GlaucomaMVF.rda b/data/GlaucomaMVF.rda
new file mode 100644
index 0000000..9d5de58
Binary files /dev/null and b/data/GlaucomaMVF.rda differ
diff --git a/data/Smoking.rda b/data/Smoking.rda
new file mode 100644
index 0000000..7143e49
Binary files /dev/null and b/data/Smoking.rda differ
diff --git a/data/dystrophy.rda b/data/dystrophy.rda
new file mode 100644
index 0000000..9d7080e
Binary files /dev/null and b/data/dystrophy.rda differ
diff --git a/inst/COPYRIGHTS b/inst/COPYRIGHTS
new file mode 100644
index 0000000..990e52c
--- /dev/null
+++ b/inst/COPYRIGHTS
@@ -0,0 +1,26 @@
+COPYRIGHT STATUS
+----------------
+
+The bulk of this code is
+
+ Copyright (C) 2002-2012 Andrea Peters and Torsten Hothorn
+
+except the code in
+
+ .R/irpart.R
+ .R/predict.irpart.R
+
+which are modifications from the files
+
+ rpart.s and predict.rpart.s
+
+from package `rpart', version 3.1-8 which is
+
+ Copyright (C) 2000 Mayo Foundation for Medical Education and
+ Research
+
+with modifications for R by Brian D. Ripley.
+
+All code is subject to the GNU General Public License, Version 2. See
+the file COPYING for the exact conditions under which you may
+redistribute it.
diff --git a/inst/NEWS b/inst/NEWS
new file mode 100644
index 0000000..06776b5
--- /dev/null
+++ b/inst/NEWS
@@ -0,0 +1,285 @@
+# $Id: CHANGES,v 1.48 2009/09/09 15:40:28 hothorn Exp $
+
+ 0.9-6 (01.03.2017)
+
+ register C routines
+
+ 0.9-5 (28.07.2015)
+
+ fix NAMESPACE
+
+ 0.9-4 (20.02.2015)
+
+ register predict.ipredknn
+
+ 0.9-3 (20.12.2013)
+
+ use trapezoid rule to compute integrated Brier score in sbrier
+
+ 0.9-2 (02.09.2013)
+
+ NAMESPACE issues, TH.data
+
+ 0.9-0 (22.10.2012)
+
+ Due to interface changes in rpart 3.1-55, the
+ bagging function had to be rewritten. Results of previous version are not
+ exactly reproducible.
+
+ 0.8-13 (21.02.2012)
+
+ import(survival)
+
+ 0.8-12 (20.02.2012)
+
+ use prodlim to compute censoring distributions in sbrier (makes a
+ difference for tied survival times)
+
+ GPL (>= 2) and no require in .onLoad
+
+ 0.8-11 (08.02.2011)
+
+ depends R >= 2.10
+
+ 0.8-10 (02.02.2011)
+
+ compress data files
+
+ 0.8-9 (27.01.2011)
+
+ fix nrow problem in sbrier, spotted by Phil Boonstra <philb at umich.edu>
+
+ avoid partial matches of function arguments
+
+ 0.8-8 (09.09.2009)
+
+ documentation fixes
+
+ 0.8-7 (27.03.2009)
+
+ survival fixes
+
+ 0.8-6 (28.07.2008)
+
+ make R devel happy (<function>$<name> is no longer allowed)
+
+ 0.8-4 (09.10.2007)
+
+ change maintainer
+
+ 0.8-3 (29.06.2005)
+
+ terms(formula, data) needs `data' argument (suggested by BDR).
+
+ 0.8-2 (09.12.2004)
+
+ - slda: correct for one explanatory variable:
+ ewp <- svd(solve(diag(diag(Snull), ncol = ncol(Snull)))%*%Snull)
+ ^^^^^^^^^^^^^
+ 0.8-1 (25.11.2004)
+
+ - change #!/bin/bash -> #!/bin/sh
+
+ 0.8-0 (02.06.2004)
+
+ - correction of NAMESPACES
+
+ 0.7-9 (13.05.2004)
+
+ -description file, insert suggests: mvtnorm
+
+ 0.7-8 (21.04.2004)
+
+ - don't run selected examples and ipred-tests.R
+
+ 0.7-7 (02.02.2004)
+
+ -return predicted values for error estimations
+ "boot" and "632plus" if required
+ -optional argument determining which observations
+ are incuded in each sample within 'errorest'
+ -"boot" and "632plus" can be computed simultanously
+
+ 0.7-6 (16.01.2004)
+
+ fix tests/ipred-segfault
+
+ 0.7-5 (19.12.2003)
+
+ examples of inbagg and predict.inbagg (don't use mvtnorm)
+
+ 0.7-4 (16.12.2003)
+
+ R-1.9.0 fixes
+
+ 0.7-3 (03.11.2003)
+
+ fix documentation bugs found by `codoc'
+
+ 0.7-2 (29.10.2003)
+
+ `rpart' is sensitive to compilers / optimization flags:
+ the results we compare `ipred's tests with are produced with
+ an optimized version of `rpart' (gcc -O2).
+
+ `eigen' in `slda' replaced by `svd'
+
+ 0.7-1 (08.08.2003)
+
+ adapt to changes in R-devel and lda (package MASS)
+
+ 0.7-0 (08.08.2003)
+
+ add namespaces
+
+ 0.6-15 (----)
+
+ new argument "getmodels=TRUE" to cv: the returned object has an
+ element "models", a list which contains the models for each fold.
+
+ new interface for inclass and adding method inbagg.
+
+ 0.6-14 (13.03.2003)
+
+ clean up bagging.Rd
+
+ 0.6-12 (12.03.2003)
+
+ methods for "integer" for the generics "bagging", "cv" and "bootest"
+
+ do not call methods to generics directly, since they may be hidded
+ (because not exported: predict.lda)
+
+ 0.6-11 (05.03.2003)
+
+ 632plus was false when the no-information error rate was less than
+ the raw bootstrap estimator (eq. 29 was used instead of eq. 32 in
+ Efron & Tibshirani, 1997). Thanks to Ramon Diaz <rdiaz at cnio.es> for
+ reporting.
+
+ changed the RNGkind to
+ RNGkind("Wichmann-Hill", "Kinderman-Ramage")
+ or RNGversion("1.6.2") making the regression tests pass R CMD check
+ with R-devel (1.7.0)
+
+ ipred is allowed to import rpart.{anova, class, exp, poisson,
+ matrix} from package rpart, thanks to BDR.
+
+ 0.6-9 (25.02.2003)
+
+ the terms attribute of data in errorest.data.frame may cause problems
+ with some predict methods -> deleted
+
+ 0.6-7 (17.02.2003)
+
+ use a formula / data framework in cv and bootest.
+
+ "model" now deals with the original variable names
+ (and formula) instead of "y" and "X".
+
+ "model" is now allowed to return a function with newdata argument for
+ prediction. This is especially useful for estimating the error of
+ both variable selection and model building simultaneously, the
+ vignette gives a simple example.
+
+ cv.numeric and bootest.numeric were broken and gave faulty estimates
+ of MSE, both problems fixed
+
+ if the maximum of votes for any class is not unique, the class is
+ choosen by random in predict.classbagg now. Formerly, the class with
+ lowest level was choosen by mistake.
+
+ 0.6-6 (06.02.2003)
+
+ fixes required by attached "methods" package
+
+ 0.6-4 (18.12.2002)
+
+ R CMD build problems
+
+ 0.6-3 (03.12.2002)
+
+ cv in errorest did faultly use all observations for estimating the
+ error which lead to over optimistic results
+
+ 0.6-2 (18.10.2002)
+
+ documentation updates and copyright status added
+
+ 0.6-1 (02.10.2002)
+
+ documentation fixes
+
+ 0.6-0 (27.09.2002)
+
+ added vignette
+ documentation updates
+
+ 0.5-7 (23.09.2002)
+
+ add internal functions irpart and predict.irpart
+ for speeding up standard bagging
+
+ use error.control for the specification of control
+ parameters
+
+ cv can be used to caculcate an "honest" prediction for each
+ observation
+
+ 0.5-6 (12.09.2002)
+
+ code factors in GBSG2 data as factors. Documentation update. Add
+ keepX argument to ipredbagg
+
+ 0.5-5 (10.09.2002)
+
+ set rpart.control(..., xval=0) by default
+
+ 0.5-4 (05.08.2002)
+
+ added k-NN with formula interface and stabilized LDA
+
+ 0.5-3 (01.08.2002)
+
+ use rpart.control() for regression and survival
+ new documentation for bagging and friends
+
+ 0.5-2 (30.07.2002)
+
+ new low-level functions cv and bootest for error rate estimators
+ (misclassification, mse, brier score)
+
+ 0.5-1 (25.07.2002)
+
+ bagging code completely rewritten
+
+ 0.4-6 (27.06.2002)
+
+ out-of-bag error for regression trees fixed.
+
+ 0.4-5 (17.06.2002)
+
+ use "minsplit = 2" in `rpart.control' passed to `bagging'
+
+ 0.4-4 (17.05.2002)
+
+ use predict.lda in bagging and predict.bagging
+ bagging(..., method="double") did not work for factors.
+
+ 0.4-3 (07.05.2002)
+
+ bugfix in bagging (in models with one regressor),
+ changes in documentation errorest
+
+ 0.4-2 (10.04.2002)
+
+ predict.bagging much faster, OOB much faster
+
+ 0.4-1 (08.04.2002)
+
+ bugfix in print.inclass, predict.inclass
+
+ 0.4-0 (26.03.2002)
+
+ pre-release for CRAN/devel
+
diff --git a/inst/doc/ipred-examples.R b/inst/doc/ipred-examples.R
new file mode 100644
index 0000000..68de92d
--- /dev/null
+++ b/inst/doc/ipred-examples.R
@@ -0,0 +1,144 @@
+### R code from vignette source 'ipred-examples.Rnw'
+
+###################################################
+### code chunk number 1: preliminaries
+###################################################
+options(prompt=">", width=50)
+set.seed(210477)
+
+
+###################################################
+### code chunk number 2: bagging
+###################################################
+library("ipred")
+library("rpart")
+library("MASS")
+data("GlaucomaM", package="TH.data")
+gbag <- bagging(Class ~ ., data = GlaucomaM, coob=TRUE)
+
+
+###################################################
+### code chunk number 3: print-bagging
+###################################################
+print(gbag)
+
+
+###################################################
+### code chunk number 4: double-bagging
+###################################################
+scomb <- list(list(model=slda, predict=function(object, newdata)
+ predict(object, newdata)$x))
+gbagc <- bagging(Class ~ ., data = GlaucomaM, comb=scomb)
+
+
+###################################################
+### code chunk number 5: predict.bagging
+###################################################
+predict(gbagc, newdata=GlaucomaM[c(1:3, 99:102), ])
+
+
+###################################################
+### code chunk number 6: indirect.formula
+###################################################
+data("GlaucomaMVF", package="ipred")
+GlaucomaMVF <- GlaucomaMVF[,-63]
+formula.indirect <- Class~clv + lora + cs ~ .
+
+
+###################################################
+### code chunk number 7: indirect.fit
+###################################################
+classify <- function (data) {
+ attach(data)
+ res <- ifelse((!is.na(clv) & !is.na(lora) & clv >= 5.1 & lora >=
+ 49.23372) | (!is.na(clv) & !is.na(lora) & !is.na(cs) &
+ clv < 5.1 & lora >= 58.55409 & cs < 1.405) | (is.na(clv) &
+ !is.na(lora) & !is.na(cs) & lora >= 58.55409 & cs < 1.405) |
+ (!is.na(clv) & is.na(lora) & cs < 1.405), 0, 1)
+ detach(data)
+ factor (res, labels = c("glaucoma", "normal"))
+}
+fit <- inclass(formula.indirect, pFUN = list(list(model = lm)),
+ cFUN = classify, data = GlaucomaMVF)
+
+
+###################################################
+### code chunk number 8: print.indirect
+###################################################
+print(fit)
+
+
+###################################################
+### code chunk number 9: predict.indirect
+###################################################
+predict(object = fit, newdata = GlaucomaMVF[c(1:3, 86:88),])
+
+
+###################################################
+### code chunk number 10: bagging.indirect
+###################################################
+mypredict.rpart <- function(object, newdata) {
+ RES <- predict(object, newdata)
+ RET <- rep(NA, nrow(newdata))
+ NAMES <- rownames(newdata)
+ RET[NAMES %in% names(RES)] <- RES[NAMES[NAMES %in% names(RES)]]
+ RET
+}
+fit <- inbagg(formula.indirect, pFUN = list(list(model = rpart, predict =
+mypredict.rpart)), cFUN = classify, nbagg = 25, data = GlaucomaMVF)
+
+
+###################################################
+### code chunk number 11: plda
+###################################################
+mypredict.lda <- function(object, newdata){
+ predict(object, newdata = newdata)$class
+}
+
+
+###################################################
+### code chunk number 12: cvlda
+###################################################
+errorest(Class ~ ., data= GlaucomaM,
+ model=lda, estimator = "cv", predict= mypredict.lda)
+
+
+###################################################
+### code chunk number 13: cvindirect
+###################################################
+errorest(formula.indirect,
+ data = GlaucomaMVF, model = inclass,
+ estimator = "632plus",
+ pFUN = list(list(model = lm)), cFUN = classify)
+
+
+###################################################
+### code chunk number 14: varsel-def
+###################################################
+mymod <- function(formula, data, level=0.05) {
+ # select all predictors that are associated with an
+ # univariate t.test p-value of less that level
+ sel <- which(lapply(data, function(x) {
+ if (!is.numeric(x))
+ return(1)
+ else
+ return(t.test(x ~ data$Class)$p.value)
+ }) < level)
+ # make sure that the response is still there
+ sel <- c(which(colnames(data) %in% "Class"), sel)
+ # compute a LDA using the selected predictors only
+ mod <- lda(formula , data=data[,sel])
+ # and return a function for prediction
+ function(newdata) {
+ predict(mod, newdata=newdata[,sel])$class
+ }
+}
+
+
+###################################################
+### code chunk number 15: varsel-comp
+###################################################
+errorest(Class ~ . , data=GlaucomaM, model=mymod, estimator = "cv",
+est.para=control.errorest(k=5))
+
+
diff --git a/inst/doc/ipred-examples.Rnw b/inst/doc/ipred-examples.Rnw
new file mode 100644
index 0000000..1f98dc1
--- /dev/null
+++ b/inst/doc/ipred-examples.Rnw
@@ -0,0 +1,401 @@
+\documentclass[11pt]{article}
+\usepackage[round]{natbib}
+\usepackage{bibentry}
+\usepackage{amsfonts}
+\usepackage{hyperref}
+\renewcommand{\baselinestretch}{1.3}
+\newcommand{\ipred}{\texttt{ipred }}
+
+%\VignetteIndexEntry{Some more or less useful examples for illustration.}
+%\VignetteDepends{ipred}
+%\textwidth=6.2in
+%\VignetteDepends{mvtnorm,TH.data,rpart,MASS}
+
+\begin{document}
+\title{\ipred: Improved Predictors}
+\date{}
+\SweaveOpts{engine=R,eps=TRUE,pdf=TRUE}
+
+<<preliminaries,echo=FALSE>>=
+options(prompt=">", width=50)
+set.seed(210477)
+@
+
+\maketitle
+
+This short manual is heavily based on
+\cite{Rnews:Peters+Hothorn+Lausen:2002} and needs some improvements.
+
+\section{Introduction}
+In classification problems, there are several attempts to create rules which assign future observations to
+certain classes. Common methods are for
+example linear discriminant analysis or
+classification trees. Recent developments lead to substantial reduction of misclassification error
+in many applications.
+Bootstrap aggregation \citep[``bagging'',][]{breiman:1996} combines
+classifiers trained on bootstrap samples of the original data. Another
+approach is indirect classification, which
+incorporates a priori knowledge
+into a classification rule \citep{hand:2001}.
+Since the misclassification error is a criterion to assess the
+classification techniques, its estimation is of main importance.
+A nearly unbiased but highly variable estimator can be calculated by cross validation. \cite{efron:1997} discuss bootstrap
+estimates of misclassification error.
+As a by-product of bagging, \cite{out-of-bag:1996} proposes the out-of-bag
+estimator. \\
+However, the calculation of the desired classification models and
+their misclassification errors is often aggravated by different and
+specialized interfaces of the various procedures. We propose the \ipred
+package as a first attempt to create a unified interface for improved predictors and various error rate estimators.
+In the following we demonstrate the functionality of the package
+in the example of glaucoma classification. We start with an overview
+about the disease and data and review the implemented
+classification and estimation methods in context with their
+application to glaucoma diagnosis.
+
+
+\section{Glaucoma}
+Glaucoma is a slowly processing and irreversible disease that affects
+the optic nerve head. It is the second most reason for blindness worldwide.
+Glaucoma is usually diagnosed based on a reduced visual field,
+assessed by a medical examination of perimetry and a smaller number of
+intact nerve fibers at the optic nerve head. One opportunity to examine
+the amount of intact nerve fibers is using the Heidelberg Retina
+Tomograph (HRT), a confocal laser scanning tomograph, which does a
+three dimensional topographical analysis of the optic nerve head morphology.
+
+It produces a series of $32$ images, each of $256 \times 256$ pixels,
+which are converted to a single topographic image. A less complex,
+but although a less informative examination tool is the $2$-dimensional
+fundus photography. However, in cooperation with clinicians and a
+priori analysis we derived a diagnosis of glaucoma based on three variables
+only: $w_{lora}$ represents the loss of nerve fibers and is obtained by a
+$2$-dimensional fundus photography, $w_{cs}$ and $w_{clv}$ describe the
+visual field defect \citep{ifcs:2001}.
+
+\begin{center}
+\begin{figure}[h]
+\begin{center}
+{\small
+\setlength{\unitlength}{0.6cm}
+\begin{picture}(14.5,5)
+ \put(5, 4.5){\makebox(2, 0.5){$w_{clv}\geq 5.1$}}
+ \put(2.5, 3){\makebox(2, 0.5){$w_{lora}\geq 49.23$}}
+ \put(7.5, 3){\makebox(2, 0.5){$w_{lora} \geq 58.55$}}
+\put(0, 1.5){\makebox(2, 0.5){$glaucoma$}}
+ \put(3.5, 1.5){\makebox(2, 0.5){$normal$}}
+ \put(6.5, 1.5){\makebox(2, 0.5){$w_{cs} < 1.405$}}
+ \put(10, 1.5){\makebox(2, 0.5){$normal$}}
+
+ \put(3.5, 0){\makebox(2, 0.5){$glaucoma$}}
+ \put(6.5, 0){\makebox(2, 0.5){$normal$}}
+
+ \put(6, 4.5){\vector(-3, -2){1.5}}
+ \put(6, 4.5){\vector(3, -2){1.5}}
+
+ \put(3.5, 3){\vector(3, -2){1.5}}
+ \put(3.5, 3){\vector(-3, -2){1.5}}
+ \put(8.5, 3){\vector(3, -2){1.5}}
+ \put(8.5, 3){\vector(-3, -2){1.5}}
+
+ \put(6.5, 1.5){\vector(3, -2){1.5}}
+ \put(6.5, 1.5){\vector(-3, -2){1.5}}
+\end{picture}
+}
+\end{center}
+\caption{Glaucoma diagnosis. \label{diag}}
+\end{figure}
+\end{center}
+
+Figure \ref{diag} represents the diagnosis of glaucoma in terms of a medical
+decision tree. A complication of the disease is that a damage in the
+optic nerve head morphology precedes a measurable
+visual field defect. Furthermore, an early detection
+is of main importance, since an adequate therapy can only slow down the
+progression of the disease. Hence, a classification rule for detecting
+early damages should include morphological informations, rather than
+visual field data only.
+
+Two example datasets are included in the package. The first one contains
+measurements of the eye morphology only (\texttt{GlaucomaM}), including $62$
+variables for $196$ observations. The second dataset (\texttt{GlaucomaMVF})
+contains additional visual field measurements for a different set of
+patients. In both example datasets, the observations in the two groups are
+matched by age and sex to prevent any bias.
+
+\section{Bagging}
+Referring to the example of glaucoma diagnosis we first
+demonstrate the functionality of the \texttt{bagging} function.
+We fit \texttt{nbagg = 25} (default) classification trees for bagging by
+<<bagging,echo=TRUE>>=
+library("ipred")
+library("rpart")
+library("MASS")
+data("GlaucomaM", package="TH.data")
+gbag <- bagging(Class ~ ., data = GlaucomaM, coob=TRUE)
+@
+where \texttt{GlaucomaM} contains explanatory HRT variables
+and the response of glaucoma diagnosis (\texttt{Class}),
+a factor at two levels \texttt{normal} and \texttt{glaucoma}.
+\texttt{print} returns informations about the returned object,
+i.e. the number of bootstrap replications used and, as requested by
+\texttt{coob=TRUE}, the out-of-bag estimate of misclassification error
+\citep{out-of-bag:1996}.
+<<print-bagging, echo=TRUE>>=
+print(gbag)
+@
+The out-of-bag estimate uses the observations which are left out in a
+bootstrap sample to estimate the misclassification error at almost no
+additional computational costs.
+\cite{double-bag:2002} propose to use the
+out-of-bag samples for a combination of linear discriminant analysis and
+classification trees, called ``Double-Bagging''. For example, a combination
+of a stabilised linear disciminant analysis with classification trees can be
+computed along the following lines
+<<double-bagging, echo=TRUE>>=
+scomb <- list(list(model=slda, predict=function(object, newdata)
+ predict(object, newdata)$x))
+gbagc <- bagging(Class ~ ., data = GlaucomaM, comb=scomb)
+@
+\texttt{predict} predicts future observations according to the
+fitted model.
+<<predict.bagging, echo=TRUE>>=
+predict(gbagc, newdata=GlaucomaM[c(1:3, 99:102), ])
+@
+Both \texttt{bagging} and \texttt{predict} rely on the \texttt{rpart}
+routines. The \texttt{rpart} routine for each bootstrap sample
+can be controlled in the usual way. By default \texttt{rpart.control} is used
+with \texttt{minsize=2} and \texttt{cp=0} and it is wise to turn
+cross-validation off (\texttt{xval=0}). The function \texttt{prune} can
+be used to prune each of the trees to an
+appropriate size.
+
+\section{Indirect Classification}
+Especially in a medical context it often occurs that a priori
+knowledge about a classifying structure is given. For example
+it might be known that a disease is assessed on a subgroup of
+the given variables or, moreover, that class memberships are
+assigned by a deterministically known classifying function.
+\cite{hand:2001} proposes the framework of indirect classification
+which incorporates this a priori knowledge into a classification rule.
+In this framework we subdivide a given data set into three groups of
+variables: those to be used predicting the class membership
+(explanatory), those to be used defining the class membership
+(intermediate) and the class membership variable itself (response).
+For future observations, an indirect classifier predicts values
+for the appointed intermediate variables based
+on explanatory variables only. The observation is classified
+based on their predicted intermediate variables and a fixed
+classifying function. This indirect way of classification using
+the predicted intermediate variables offers possibilities to
+incorporate a priori knowledge by the subdivision of variables and
+by the construction of a fixed classifying function.
+
+We apply indirect classification by using the function \texttt{inclass}.
+Referring to the glaucoma example, explanatory variables are HRT
+and anamnestic variables only, intermediate variables
+are $w_{lora}, \, w_{cs}$ and $w_{clv}$. The response is the
+diagnosis of glaucoma which is determined by a fixed classifying
+function and therefore not included in the learning
+sample \texttt{GlaucomaMVF}. We assign the given variables to explanatory
+and intermediate by specifying the input formula.
+<<indirect.formula, echo=TRUE>>=
+data("GlaucomaMVF", package="ipred")
+GlaucomaMVF <- GlaucomaMVF[,-63]
+formula.indirect <- Class~clv + lora + cs ~ .
+@
+The variables on the left-hand side represent the intermediate variables,
+modeled by the explanatory variables on the right-hand side. Almost each
+modeling technique can be used to predict the intermediate variables. We
+chose a linear model by \texttt{pFUN = list(list(model = lm))}.
+<<indirect.fit, echo=TRUE>>=
+classify <- function (data) {
+ attach(data)
+ res <- ifelse((!is.na(clv) & !is.na(lora) & clv >= 5.1 & lora >=
+ 49.23372) | (!is.na(clv) & !is.na(lora) & !is.na(cs) &
+ clv < 5.1 & lora >= 58.55409 & cs < 1.405) | (is.na(clv) &
+ !is.na(lora) & !is.na(cs) & lora >= 58.55409 & cs < 1.405) |
+ (!is.na(clv) & is.na(lora) & cs < 1.405), 0, 1)
+ detach(data)
+ factor (res, labels = c("glaucoma", "normal"))
+}
+fit <- inclass(formula.indirect, pFUN = list(list(model = lm)),
+ cFUN = classify, data = GlaucomaMVF)
+@
+\texttt{print} displays the subdivision of variables and the chosen
+modeling technique
+<<print.indirect, echo=TRUE>>=
+print(fit)
+@
+Furthermore, indirect classification predicts the intermediate
+variables based on the explanatory variables and classifies them
+according to a fixed classifying function in a second step, that means
+a deterministically known function for the class membership has to be
+specified. In our example this function is given in
+Figure \ref{diag} and implemented in the function \texttt{classify}.\\
+Prediction of future observations is now performed by
+<<predict.indirect, echo=TRUE>>=
+predict(object = fit, newdata = GlaucomaMVF[c(1:3, 86:88),])
+@
+We perform a bootstrap aggregated indirect classification approach by
+choosing \texttt{pFUN = bagging} and specifying the number of
+bootstrap samples \citep{ifcs:2001}. Regression or classification
+trees are fitted for each bootstrap sample, with respect to the
+measurement scale of the specified intermediate variables
+<<bagging.indirect, echo=TRUE>>=
+mypredict.rpart <- function(object, newdata) {
+ RES <- predict(object, newdata)
+ RET <- rep(NA, nrow(newdata))
+ NAMES <- rownames(newdata)
+ RET[NAMES %in% names(RES)] <- RES[NAMES[NAMES %in% names(RES)]]
+ RET
+}
+fit <- inbagg(formula.indirect, pFUN = list(list(model = rpart, predict =
+mypredict.rpart)), cFUN = classify, nbagg = 25, data = GlaucomaMVF)
+@
+The call for the prediction of values remains unchanged.
+
+
+\section{Error Rate Estimation}
+Classification rules are usually assessed by their misclassification rate.
+Hence, error rate estimation is of main importance.
+The function \texttt{errorest} implements a unified interface to several
+resampling based estimators. Referring to the example, we apply a linear
+discriminant analysis and specify the error rate estimator
+by \texttt{estimator = "cv", "boot"} or \texttt{"632plus"},
+respectively. A 10-fold cross validation is performed by
+choosing \texttt{estimator = "cv"} and
+\texttt{est.para = control.errorest(k = 10)}. The options \texttt{estimator = "boot"} or
+\texttt{estimator = "632plus"} deliver a bootstrap estimator
+and its bias corrected version {\sl .632+} \citep[see][]{efron:1997},
+we specify the number of bootstrap samples to be drawn by
+\texttt{est.para = control.errorest(nboot = 50)}.
+Further arguments are required to particularize the
+classification technique. The argument \texttt{predict} represents
+the chosen predictive function. For a unified interface
+\texttt{predict} has to be based on the arguments \texttt{object}
+and \texttt{newdata} only, therefore a wrapper function \texttt{mypredict} is necessary for classifiers
+which require more than those arguments or do not return the predicted
+classes by default. For a linear discriminant analysis with \texttt{lda}, we
+need to specify
+<<plda, echo=TRUE>>=
+mypredict.lda <- function(object, newdata){
+ predict(object, newdata = newdata)$class
+}
+@
+and calculate a 10-fold-cross-validated error rate estimator
+for a linear discriminant analysis by calling
+<<cvlda, echo=TRUE>>=
+errorest(Class ~ ., data= GlaucomaM,
+ model=lda, estimator = "cv", predict= mypredict.lda)
+@
+For the indirect approach the specification of the call becomes
+slightly more complicated.
+%Again for a unified interface a wrapper
+%function has to be used, which incorporates the fixed classification rule
+The bias corrected estimator {\sl .632+} is computed by
+<<cvindirect, echo=TRUE>>=
+errorest(formula.indirect,
+ data = GlaucomaMVF, model = inclass,
+ estimator = "632plus",
+ pFUN = list(list(model = lm)), cFUN = classify)
+@
+Because of the subdivision of variables and a formula describing the
+modeling between explanatory and intermediate variables only,
+we must call the class membership variable. Hence, in contrast to the
+function \texttt{inclass} the data set \texttt{GlaucomaMVF} used in
+\texttt{errorest} must contain explanatory, intermediate and response
+variables.
+
+Sometimes it may be necessary to reduce the number of predictors before
+training a classifier. Estimating the error rate after the variable
+selection leads to biased estimates of the misclassfication error and
+therefore one should estimate the error rate of the whole procedure. Within
+the \texttt{errorest} framework, this can be done as follows. First, we define
+a function which does both variable selection and training of the
+classifier. For illustration proposes, we select the predictors by comparing
+their univariate $P$-values of a two-sample $t$-test with a prespecified
+level and train a LDA using the selected variables only.
+
+<<varsel-def, echo=TRUE>>=
+mymod <- function(formula, data, level=0.05) {
+ # select all predictors that are associated with an
+ # univariate t.test p-value of less that level
+ sel <- which(lapply(data, function(x) {
+ if (!is.numeric(x))
+ return(1)
+ else
+ return(t.test(x ~ data$Class)$p.value)
+ }) < level)
+ # make sure that the response is still there
+ sel <- c(which(colnames(data) %in% "Class"), sel)
+ # compute a LDA using the selected predictors only
+ mod <- lda(formula , data=data[,sel])
+ # and return a function for prediction
+ function(newdata) {
+ predict(mod, newdata=newdata[,sel])$class
+ }
+}
+@
+
+Note that \texttt{mymod} does not return
+an object of class \texttt{lda} but a function
+with argument \texttt{newdata} only. Thanks to lexical scoping, this
+function is used for computing
+predicted classes instead of a function \texttt{predict} passed to
+\texttt{errorest} as argument. Computing a $5$-fold cross-validated error rate
+estimator now is approximately a one-liner.
+
+<<varsel-comp, echo=TRUE>>=
+errorest(Class ~ . , data=GlaucomaM, model=mymod, estimator = "cv",
+est.para=control.errorest(k=5))
+@
+
+
+%%To summarize the performance of the different classification techniques in the considered example of glaucoma diagnosis, the 10-fold
+%%cross-validated error estimator delivers the
+%%results given in Table \ref{tenf}.
+%%\begin{figure}
+%%\begin{center}
+%%\begin{tabular}{ rrr }
+%%\hline
+%%dataset & method & error estimate \\
+%%\hline
+%%\texttt{GlaucomaM} & {\sl slda} & 0.168 \\
+%%\texttt{GlaucomaM} & {\sl bagging} & 0.158 \\
+%%\texttt{GlaucomaM} & {\sl double-bagging} & 0.153 \\
+%%\texttt{GlaucomaMVF} & {\sl inclass-bagging} & 0.206 \\
+%%\tetxtt{GlaucomaMVF} & {\sl inclass-lm} & 0.229 \\
+%%\hline
+%%\end{tabular}
+%%\caption{10-fold cross-validated error estimation of
+%%the misclassification error for several classification
+%%methods: {\sl slda} - stabilised linear discriminant analysis,
+%%{\sl bagging} - bagging with 50 bootstrap samples,
+%%{\sl double-bagging} - bagging with 50 bootstrap samples,
+%%combined with sLDA, {\sl inclass-bagging} -
+%%indirect classification using bagging,
+%%{\sl inclass-lm} indirect classification using
+%%linear modeling. \label{tenf}}
+%%\end{center}
+%%\end{figure}
+%%Note that an estimator of the variance is available for the ordinary
+%%bootstrap estimator (\texttt{estimator="boot"}) only, see \cite{efron:1997}.
+
+
+\section{Summary}
+\ipred tries to implement a unified interface to some recent developments
+in classification and error rate estimation. It is by no means finished
+nor perfect and we very much appreciate comments, suggestions and criticism.
+Currently, the major drawback is speed. Calling \texttt{rpart} $50$
+times for each bootstrap sample is relatively inefficient
+but the design of interfaces was our main focus instead of optimization.
+Beside the examples shown, \texttt{bagging} can be used to compute bagging
+for regression trees and \texttt{errorest} computes estimators of the
+mean squared error for regression models.
+
+\bibliographystyle{plainnat}
+\bibliography{ipred}
+
+
+\end{document}
diff --git a/inst/doc/ipred-examples.pdf b/inst/doc/ipred-examples.pdf
new file mode 100644
index 0000000..2d6499a
Binary files /dev/null and b/inst/doc/ipred-examples.pdf differ
diff --git a/man/DLBCL.Rd b/man/DLBCL.Rd
new file mode 100644
index 0000000..6d27fa3
--- /dev/null
+++ b/man/DLBCL.Rd
@@ -0,0 +1,57 @@
+\name{DLBCL}
+\alias{DLBCL}
+\non_function{}
+\title{ Diffuse Large B-Cell Lymphoma }
+\usage{data("DLBCL")}
+\description{
+A data frame with gene expression data from diffuse large B-cell
+lymphoma (DLBCL) patients.
+}
+\format{
+ This data frame contains the following columns:
+ \describe{
+ \item{DLCL.Sample}{DLBCL identifier.}
+ \item{Gene.Expression}{Gene expression group.}
+ \item{time}{survival time in month.}
+ \item{cens}{censoring: 0 censored, 1 dead.}
+ \item{IPI}{International prognostic index.}
+ \item{MGEc.1}{mean gene expression in cluster 1.}
+ \item{MGEc.2}{mean gene expression in cluster 2.}
+ \item{MGEc.3}{mean gene expression in cluster 3.}
+ \item{MGEc.4}{mean gene expression in cluster 4.}
+ \item{MGEc.5}{mean gene expression in cluster 5.}
+ \item{MGEc.6}{mean gene expression in cluster 6.}
+ \item{MGEc.7}{mean gene expression in cluster 7.}
+ \item{MGEc.8}{mean gene expression in cluster 8.}
+ \item{MGEc.9}{mean gene expression in cluster 9.}
+ \item{MGEc.10}{mean gene expression in cluster 10.}
+ }
+}
+\source{
+Except of \code{MGE}, the data is published at
+\url{http://llmpp.nih.gov/lymphoma/data.shtml}. \code{MGEc.*} is the mean of
+the gene expression in each of ten clusters derived by agglomerative average
+linkage hierarchical cluster analysis (Hothorn et al., 2002).
+
+}
+\references{
+Ash A. Alizadeh et. al (2000), Distinct types of diffuse large
+B-cell lymphoma identified by gene
+expression profiling. \emph{Nature}, \bold{403}, 504--509.
+
+Torsten Hothorn, Berthold Lausen, Axel Benner and Martin
+Radespiel-Troeger (2004), Bagging Survival Trees.
+\emph{Statistics in Medicine}, \bold{23}, 77--91.
+
+
+}
+\examples{
+
+set.seed(290875)
+
+data("DLBCL", package="ipred")
+library("survival")
+survfit(Surv(time, cens) ~ 1, data=DLBCL)
+
+}
+\keyword{datasets}
diff --git a/man/GlaucomaMVF.Rd b/man/GlaucomaMVF.Rd
new file mode 100644
index 0000000..3337e71
--- /dev/null
+++ b/man/GlaucomaMVF.Rd
@@ -0,0 +1,134 @@
+\name{GlaucomaMVF}
+\alias{GlaucomaMVF}
+\non_function{}
+\title{ Glaucoma Database }
+\usage{data("GlaucomaMVF")}
+\description{
+The \code{GlaucomaMVF} data has 170 observations in two classes.
+66 predictors are derived from a confocal laser scanning image of the
+optic nerve head, from a visual field test, a fundus photography and a
+measurement of the intra occular pressure.
+}
+\format{
+ This data frame contains the following predictors describing the
+ morphology of the optic nerve head, the visual field, the intra
+ occular pressure and a membership variable:
+ \describe{
+ \item{ag}{area global.}
+ \item{at}{area temporal.}
+ \item{as}{area superior.}
+ \item{an}{area nasal.}
+ \item{ai}{area inferior.}
+ \item{eag}{effective area global.}
+ \item{eat}{effective area temporal.}
+ \item{eas}{effective area superior.}
+ \item{ean}{effective area nasal.}
+ \item{eai}{effective area inferior.}
+ \item{abrg}{area below reference global.}
+ \item{abrt}{area below reference temporal.}
+ \item{abrs}{area below reference superior.}
+ \item{abrn}{area below reference nasal.}
+ \item{abri}{area below reference inferior.}
+ \item{hic}{height in contour.}
+ \item{mhcg}{mean height contour global.}
+ \item{mhct}{mean height contour temporal.}
+ \item{mhcs}{mean height contour superior.}
+ \item{mhcn}{mean height contour nasal.}
+ \item{mhci}{mean height contour inferior.}
+ \item{phcg}{peak height contour.}
+ \item{phct}{peak height contour temporal.}
+ \item{phcs}{peak height contour superior.}
+ \item{phcn}{peak height contour nasal.}
+ \item{phci}{peak height contour inferior.}
+ \item{hvc}{height variation contour.}
+ \item{vbsg}{volume below surface global.}
+ \item{vbst}{volume below surface temporal.}
+ \item{vbss}{volume below surface superior.}
+ \item{vbsn}{volume below surface nasal.}
+ \item{vbsi}{volume below surface inferior.}
+ \item{vasg}{volume above surface global.}
+ \item{vast}{volume above surface temporal.}
+ \item{vass}{volume above surface superior.}
+ \item{vasn}{volume above surface nasal.}
+ \item{vasi}{volume above surface inferior.}
+ \item{vbrg}{volume below reference global.}
+ \item{vbrt}{volume below reference temporal.}
+ \item{vbrs}{volume below reference superior.}
+ \item{vbrn}{volume below reference nasal.}
+ \item{vbri}{volume below reference inferior.}
+ \item{varg}{volume above reference global.}
+ \item{vart}{volume above reference temporal.}
+ \item{vars}{volume above reference superior.}
+ \item{varn}{volume above reference nasal.}
+ \item{vari}{volume above reference inferior.}
+ \item{mdg}{mean depth global.}
+ \item{mdt}{mean depth temporal.}
+ \item{mds}{mean depth superior.}
+ \item{mdn}{mean depth nasal.}
+ \item{mdi}{mean depth inferior.}
+ \item{tmg}{third moment global.}
+ \item{tmt}{third moment temporal.}
+ \item{tms}{third moment superior.}
+ \item{tmn}{third moment nasal.}
+ \item{tmi}{third moment inferior.}
+ \item{mr}{mean radius.}
+ \item{rnf}{retinal nerve fiber thickness.}
+ \item{mdic}{mean depth in contour.}
+ \item{emd}{effective mean depth.}
+ \item{mv}{mean variability.}
+ \item{tension}{intra occular pressure.}
+ \item{clv}{corrected loss variance, variability of the visual field.}
+ \item{cs}{contrast sensitivity of the visual field.}
+ \item{lora}{loss of rim area, measured by fundus photography.}
+ \item{Class}{a factor with levels \code{glaucoma} and \code{normal}.}
+ }
+}
+\details{
+Confocal laser images of the eye background are taken with the
+Heidelberg Retina Tomograph and variables 1-62 are derived.
+Most of these variables describe either the area or volume in
+certain parts of the papilla and are measured in
+four sectors (temporal, superior, nasal and inferior) as well
+as for the whole papilla (global). The global measurement is,
+roughly, the sum of the measurements taken in the four sector.
+
+The perimeter `Octopus' measures the visual field variables \code{clv}
+and \code{cs}, stereo optic disks photographs were taken with a
+telecentric fundus camera and \code{lora} is derived.
+
+Observations of both groups are matched by age and sex,
+to prevent for possible confounding.
+
+}
+\source{
+Andrea Peters, Berthold Lausen, Georg Michelson and Olaf Gefeller (2003),
+Diagnosis of glaucoma by indirect classifiers.
+\emph{Methods of Information in Medicine} \bold{1}, 99-103.
+}
+
+\note{
+ \code{GLaucomMVF} overlaps in some parts with \code{\link[TH.data]{GlaucomaM}}.
+}
+
+\examples{
+\dontrun{
+
+data("GlaucomaMVF", package = "ipred")
+library("rpart")
+
+response <- function (data) {
+ attach(data)
+ res <- ifelse((!is.na(clv) & !is.na(lora) & clv >= 5.1 & lora >=
+ 49.23372) | (!is.na(clv) & !is.na(lora) & !is.na(cs) &
+ clv < 5.1 & lora >= 58.55409 & cs < 1.405) | (is.na(clv) &
+ !is.na(lora) & !is.na(cs) & lora >= 58.55409 & cs < 1.405) |
+ (!is.na(clv) & is.na(lora) & cs < 1.405), 0, 1)
+ detach(data)
+ factor (res, labels = c("glaucoma", "normal"))
+}
+
+errorest(Class~clv+lora+cs~., data = GlaucomaMVF, model=inclass,
+ estimator="cv", pFUN = list(list(model = rpart)), cFUN = response)
+}
+}
+\keyword{datasets}
diff --git a/man/Smoking.Rd b/man/Smoking.Rd
new file mode 100644
index 0000000..0bc0146
--- /dev/null
+++ b/man/Smoking.Rd
@@ -0,0 +1,38 @@
+\name{Smoking}
+\alias{Smoking}
+\non_function{}
+\title{Smoking Styles}
+\usage{data("Smoking")}
+\description{
+The \code{Smoking} data frame has 55 rows and 9 columns.
+}
+\format{
+ This data frame contains the following columns:
+ \describe{
+ \item{NR}{numeric, patient number.}
+ \item{Sex}{factor, sex of patient.}
+ \item{Age}{factor, age group of patient, grouping consisting of those in their twenties, those in their thirties and so on.}
+ \item{TarY}{numeric, tar yields of the cigarettes.}
+ \item{NicY}{numeric, nicotine yields of the cigarettes.}
+ \item{COY}{numeric, carbon monoxide (CO) yield of the cigarettes.}
+ \item{TVPS}{numeric, total volume puffed smoke.}
+ \item{BPNL}{numeric, blood plasma nicotine level.}
+ \item{COHB}{numeric, carboxyhaemoglobin level, i.e. amount of CO absorbed by the blood stream.}
+ }
+}
+
+\details{
+ The data describes different smoking habits of probands.
+}
+
+\source{
+ Hand and Taylor (1987), Study F \emph{Smoking Styles}.
+}
+
+\references{
+D.J. Hand and C.C. Taylor (1987),
+\emph{Multivariate analysis of variance and repeated measures.} London: Chapman \&
+Hall, pp. 167--181.
+}
+
+\keyword{datasets}
diff --git a/man/bagging.Rd b/man/bagging.Rd
new file mode 100644
index 0000000..47be547
--- /dev/null
+++ b/man/bagging.Rd
@@ -0,0 +1,238 @@
+\name{bagging}
+\alias{bagging}
+\alias{ipredbagg}
+\alias{ipredbagg.factor}
+\alias{ipredbagg.integer}
+\alias{ipredbagg.numeric}
+\alias{ipredbagg.Surv}
+\alias{ipredbagg.default}
+\alias{bagging.data.frame}
+\alias{bagging.default}
+\title{Bagging Classification, Regression and Survival Trees }
+\description{
+ Bagging for classification, regression and survival trees.
+}
+\usage{
+\method{ipredbagg}{factor}(y, X=NULL, nbagg=25, control=
+ rpart.control(minsplit=2, cp=0, xval=0),
+ comb=NULL, coob=FALSE, ns=length(y), keepX = TRUE, \dots)
+\method{ipredbagg}{numeric}(y, X=NULL, nbagg=25, control=rpart.control(xval=0),
+ comb=NULL, coob=FALSE, ns=length(y), keepX = TRUE, \dots)
+\method{ipredbagg}{Surv}(y, X=NULL, nbagg=25, control=rpart.control(xval=0),
+ comb=NULL, coob=FALSE, ns=dim(y)[1], keepX = TRUE, \dots)
+\method{bagging}{data.frame}(formula, data, subset, na.action=na.rpart, \dots)
+}
+\arguments{
+ \item{y}{the response variable: either a factor vector of class labels
+ (bagging classification trees), a vector of numerical values
+ (bagging regression trees) or an object of class
+ \code{\link[survival]{Surv}} (bagging survival trees).}
+ \item{X}{a data frame of predictor variables.}
+ \item{nbagg}{an integer giving the number of bootstrap replications. }
+ \item{coob}{a logical indicating whether an out-of-bag estimate of the
+ error rate (misclassification error, root mean squared error
+ or Brier score) should be computed.
+ See \code{\link{predict.classbagg}} for
+ details.}
+ \item{control}{options that control details of the \code{rpart}
+ algorithm, see \code{\link[rpart]{rpart.control}}. It is
+ wise to set \code{xval = 0} in order to save computing
+ time. Note that the
+ default values depend on the class of \code{y}.}
+ \item{comb}{a list of additional models for model combination, see below
+for some examples. Note that argument \code{method} for double-bagging is no longer there,
+\code{comb} is much more flexible.}
+ \item{ns}{number of sample to draw from the learning sample. By default,
+ the usual bootstrap n out of n with replacement is performed.
+ If \code{ns} is smaller than \code{length(y)}, subagging
+ (Buehlmann and Yu, 2002), i.e. sampling \code{ns} out of
+ \code{length(y)} without replacement, is performed.}
+ \item{keepX}{a logical indicating whether the data frame of predictors
+ should be returned. Note that the computation of the
+ out-of-bag estimator requires \code{keepX=TRUE}.}
+ \item{formula}{a formula of the form \code{lhs ~ rhs} where \code{lhs}
+ is the response variable and \code{rhs} a set of
+ predictors.}
+ \item{data}{optional data frame containing the variables in the
+ model formula.}
+ \item{subset}{optional vector specifying a subset of observations
+ to be used.}
+ \item{na.action}{function which indicates what should happen when
+ the data contain \code{NA}s. Defaults to
+ \code{\link[rpart]{na.rpart}}.}
+ \item{...}{additional parameters passed to \code{ipredbagg} or
+\code{\link[rpart]{rpart}}, respectively.}
+}
+
+\details{
+
+The random forest implementations \code{\link[randomForest]{randomForest}}
+and \code{\link[party]{cforest}} are more flexible and reliable for computing
+bootstrap-aggregated trees than this function and should be used instead.
+
+ Bagging for classification and regression trees were suggested by
+Breiman (1996a, 1998) in order to stabilise trees.
+
+The trees in this function are computed using the implementation in the
+\code{\link[rpart]{rpart}} package. The generic function \code{ipredbagg}
+implements methods for different responses. If \code{y} is a factor,
+classification trees are constructed. For numerical vectors
+\code{y}, regression trees are aggregated and if \code{y} is a survival
+object, bagging survival trees (Hothorn et al, 2003) is performed.
+The function \code{bagging} offers a formula based interface to
+\code{ipredbagg}.
+
+\code{nbagg} bootstrap samples are drawn and a tree is constructed
+for each of them. There is no general rule when to stop the tree
+growing. The size of the
+trees can be controlled by \code{control} argument
+or \code{\link{prune.classbagg}}. By
+default, classification trees are as large as possible whereas regression
+trees and survival trees are build with the standard options of
+\code{\link[rpart]{rpart.control}}. If \code{nbagg=1}, one single tree is
+computed for the whole learning sample without bootstrapping.
+
+If \code{coob} is TRUE, the out-of-bag sample (Breiman,
+1996b) is used to estimate the prediction error
+corresponding to \code{class(y)}. Alternatively, the out-of-bag sample can
+be used for model combination, an out-of-bag error rate estimator is not
+available in this case. Double-bagging (Hothorn and Lausen,
+2003) computes a LDA on the out-of-bag sample and uses the discriminant
+variables as additional predictors for the classification trees. \code{comb}
+is an optional list of lists with two elements \code{model} and \code{predict}.
+\code{model} is a function with arguments \code{formula} and \code{data}.
+\code{predict} is a function with arguments \code{object, newdata} only. If
+the estimation of the covariance matrix in \code{\link{lda}} fails due to a
+limited out-of-bag sample size, one can use \code{\link{slda}} instead.
+See the example section for an example of double-bagging. The methodology is
+not limited to a combination with LDA: bundling (Hothorn and Lausen, 2002b)
+can be used with arbitrary classifiers.
+
+NOTE: Up to ipred version 0.9-0, bagging was performed using a modified version
+of the original rpart function. Due to interface changes in rpart 3.1-55, the
+bagging function had to be rewritten. Results of previous version are not
+exactly reproducible.
+
+}
+
+\value{
+ The class of the object returned depends on \code{class(y)}:
+\code{classbagg, regbagg} and \code{survbagg}. Each is a list with elements
+\item{y}{the vector of responses.}
+\item{X}{the data frame of predictors.}
+\item{mtrees}{multiple trees: a list of length \code{nbagg} containing the
+trees (and possibly additional objects) for each bootstrap sample.}
+\item{OOB}{logical whether the out-of-bag estimate should be computed.}
+\item{err}{if \code{OOB=TRUE}, the out-of-bag estimate of
+misclassification or root mean squared error or the Brier score for censored
+data.}
+\item{comb}{logical whether a combination of models was requested.}
+
+ For each class methods for the generics \code{\link[rpart]{prune.rpart}},
+\code{\link{print}}, \code{\link{summary}} and \code{\link{predict}} are
+available for inspection of the results and prediction, for example:
+\code{\link{print.classbagg}}, \code{\link{summary.classbagg}},
+\code{\link{predict.classbagg}} and \code{\link{prune.classbagg}} for
+classification problems.
+
+}
+
+\references{
+
+Leo Breiman (1996a), Bagging Predictors. \emph{Machine Learning}
+\bold{24}(2), 123--140.
+
+Leo Breiman (1996b), Out-Of-Bag Estimation. \emph{Technical Report}
+\url{http://www.stat.berkeley.edu/~breiman/OOBestimation.pdf}.
+
+Leo Breiman (1998), Arcing Classifiers. \emph{The Annals of Statistics}
+\bold{26}(3), 801--824.
+
+Peter Buehlmann and Bin Yu (2002), Analyzing Bagging. \emph{The Annals of
+Statistics} \bold{30}(4), 927--961.
+
+Torsten Hothorn and Berthold Lausen (2003), Double-Bagging: Combining
+classifiers by bootstrap aggregation. \emph{Pattern Recognition},
+\bold{36}(6), 1303--1309.
+
+Torsten Hothorn and Berthold Lausen (2005), Bundling Classifiers by Bagging
+Trees. \emph{Computational Statistics & Data Analysis}, 49, 1068--1078.
+
+Torsten Hothorn, Berthold Lausen, Axel Benner and Martin
+Radespiel-Troeger (2004), Bagging Survival Trees.
+\emph{Statistics in Medicine}, \bold{23}(1), 77--91.
+
+}
+
+\examples{
+
+library("MASS")
+library("survival")
+
+# Classification: Breast Cancer data
+
+data("BreastCancer", package = "mlbench")
+
+# Test set error bagging (nbagg = 50): 3.7\% (Breiman, 1998, Table 5)
+
+mod <- bagging(Class ~ Cl.thickness + Cell.size
+ + Cell.shape + Marg.adhesion
+ + Epith.c.size + Bare.nuclei
+ + Bl.cromatin + Normal.nucleoli
+ + Mitoses, data=BreastCancer, coob=TRUE)
+print(mod)
+
+# Test set error bagging (nbagg=50): 7.9\% (Breiman, 1996a, Table 2)
+data("Ionosphere", package = "mlbench")
+Ionosphere$V2 <- NULL # constant within groups
+
+bagging(Class ~ ., data=Ionosphere, coob=TRUE)
+
+# Double-Bagging: combine LDA and classification trees
+
+# predict returns the linear discriminant values, i.e. linear combinations
+# of the original predictors
+
+comb.lda <- list(list(model=lda, predict=function(obj, newdata)
+ predict(obj, newdata)$x))
+
+# Note: out-of-bag estimator is not available in this situation, use
+# errorest
+
+mod <- bagging(Class ~ ., data=Ionosphere, comb=comb.lda)
+
+predict(mod, Ionosphere[1:10,])
+
+# Regression:
+
+data("BostonHousing", package = "mlbench")
+
+# Test set error (nbagg=25, trees pruned): 3.41 (Breiman, 1996a, Table 8)
+
+mod <- bagging(medv ~ ., data=BostonHousing, coob=TRUE)
+print(mod)
+
+library("mlbench")
+learn <- as.data.frame(mlbench.friedman1(200))
+
+# Test set error (nbagg=25, trees pruned): 2.47 (Breiman, 1996a, Table 8)
+
+mod <- bagging(y ~ ., data=learn, coob=TRUE)
+print(mod)
+
+# Survival data
+
+# Brier score for censored data estimated by
+# 10 times 10-fold cross-validation: 0.2 (Hothorn et al,
+# 2002)
+
+data("DLBCL", package = "ipred")
+mod <- bagging(Surv(time,cens) ~ MGEc.1 + MGEc.2 + MGEc.3 + MGEc.4 + MGEc.5 +
+ MGEc.6 + MGEc.7 + MGEc.8 + MGEc.9 +
+ MGEc.10 + IPI, data=DLBCL, coob=TRUE)
+
+print(mod)
+
+
+}
+\keyword{tree}
diff --git a/man/bootest.Rd b/man/bootest.Rd
new file mode 100644
index 0000000..9d9f603
--- /dev/null
+++ b/man/bootest.Rd
@@ -0,0 +1,48 @@
+\name{bootest}
+\alias{bootest}
+\alias{bootest.default}
+\alias{bootest.factor}
+\alias{bootest.numeric}
+\alias{bootest.integer}
+\alias{bootest.Surv}
+\title{Bootstrap Error Rate Estimators}
+\description{
+ Those functions are low-level functions used by \code{\link{errorest}} and
+are normally not called by users.
+}
+\usage{
+\method{bootest}{factor}(y, formula, data, model, predict, nboot=25,
+bc632plus=FALSE, list.tindx = NULL, predictions = FALSE,
+both.boot = FALSE, \dots)}
+\arguments{
+ \item{y}{the response variable, either of class \code{factor}
+(classification), \code{numeric} (regression) or \code{Surv} (survival).}
+ \item{formula}{a formula object.}
+ \item{data}{data frame of predictors and response described in
+\code{formula}.}
+ \item{model}{a function implementing the predictive model to be
+evaluated. The function \code{model} can either return an
+ object representing a fitted model or a function with
+ argument \code{newdata} which returns predicted values. In
+ this case, the \code{predict} argument to \code{errorest} is
+ ignored.}
+ \item{predict}{a function with arguments \code{object} and \code{newdata}
+only which predicts the status of the observations in \code{newdata} based
+on the fitted model in \code{object}.}
+ \item{nboot}{number of bootstrap replications to be used.}
+ \item{bc632plus}{logical. Should the bias corrected version of misclassification
+ error be computed?}
+ \item{predictions}{logical, return a matrix of predictions. The ith column
+contains predictions of the ith out-of-bootstrap sample and 'NA's
+corresponding to the ith bootstrap sample.}
+ \item{list.tindx}{list of numeric vectors, indicating which
+ observations are included in each bootstrap sample.}
+ \item{both.boot}{logical, return both (bootstrap and 632plus) estimations or
+ only one of them.}
+ \item{\dots}{additional arguments to \code{model}.}
+}
+
+\details{
+ See \code{\link{errorest}}.
+}
+\keyword{misc}
diff --git a/man/control.errorest.Rd b/man/control.errorest.Rd
new file mode 100644
index 0000000..fb24b9b
--- /dev/null
+++ b/man/control.errorest.Rd
@@ -0,0 +1,33 @@
+\name{control.errorest}
+\alias{control.errorest}
+\title{ Control Error Rate Estimators }
+\description{
+ Some parameters that control the behaviour of \code{\link{errorest}}.
+}
+\usage{
+control.errorest(k = 10, nboot = 25, strat = FALSE, random = TRUE,
+ predictions = FALSE, getmodels=FALSE, list.tindx = NULL)
+}
+\arguments{
+ \item{k}{integer, specify $k$ for $k$-fold cross-validation.}
+ \item{nboot}{integer, number of bootstrap replications.}
+ \item{strat}{logical, if \code{TRUE}, cross-validation is performed
+ using stratified sampling (for classification problems).}
+ \item{random}{logical, if \code{TRUE}, cross-validation is performed using
+ a random ordering of the data.}
+ \item{predictions}{logical, indicates whether the prediction
+ for each observation should be returned or not
+ (classification and regression only). For a bootstrap
+based estimator a matrix of size 'number of observations' times nboot is
+returned with predicted values of the ith out-of-bootstrap sample in column
+i and 'NA's for those observations not included in the ith out-of-bootstrap
+sample.}
+ \item{getmodels}{logical, indicates a list of all models should be
+ returned. For cross-validation only.}
+ \item{list.tindx}{list of numeric vectors, indicating which
+ observations are included in each bootstrap or cross-validation sample, respectively.}
+}
+\value{
+ A list with the same components as arguments.
+}
+\keyword{misc}
diff --git a/man/cv.Rd b/man/cv.Rd
new file mode 100644
index 0000000..6c7876d
--- /dev/null
+++ b/man/cv.Rd
@@ -0,0 +1,46 @@
+\name{cv}
+\alias{cv}
+\alias{cv.default}
+\alias{cv.factor}
+\alias{cv.numeric}
+\alias{cv.integer}
+\alias{cv.Surv}
+\title{Cross-validated Error Rate Estimators.}
+\description{
+ Those functions are low-level functions used by \code{\link{errorest}} and
+are normally not called by users.
+}
+\usage{
+\method{cv}{factor}(y, formula, data, model, predict, k=10, random=TRUE,
+ strat=FALSE,
+ predictions=NULL, getmodels=NULL, list.tindx = NULL, \dots)
+}
+\arguments{
+ \item{y}{response variable, either of class \code{factor}
+(classification), \code{numeric} (regression) or \code{Surv} (survival).}
+ \item{formula}{a formula object.}
+ \item{data}{data frame of predictors and response described in \code{formula}.}
+ \item{model}{a function implementing the predictive model to be
+evaluated. The function \code{model} can either return an
+ object representing a fitted model or a function with
+ argument \code{newdata} which returns predicted values. In
+ this case, the \code{predict} argument to \code{errorest} is
+ ignored.}
+ \item{predict}{a function with arguments \code{object} and \code{newdata}
+only which predicts the status of the observations in \code{newdata} based
+on the fitted model in \code{object}.}
+ \item{k}{k-fold cross-validation.}
+ \item{random}{logical, indicates whether a random order or the given
+order of the data should be used for sample splitting or not, defaults to
+\code{TRUE}.}
+ \item{strat}{logical, stratified sampling or not, defaults to \code{FALSE}.}
+ \item{predictions}{logical, return the prediction of each observation.}
+ \item{getmodels}{logical, return a list of models for each fold.}
+ \item{list.tindx}{list of numeric vectors, indicating which
+ observations are included in each cross-validation sample.}
+ \item{\dots}{additional arguments to \code{model}.}
+}
+\details{
+ See \code{\link{errorest}}.
+}
+\keyword{misc}
diff --git a/man/dystrophy.Rd b/man/dystrophy.Rd
new file mode 100644
index 0000000..be9051a
--- /dev/null
+++ b/man/dystrophy.Rd
@@ -0,0 +1,59 @@
+\name{dystrophy}
+\alias{dystrophy}
+\non_function{}
+\title{Detection of muscular dystrophy carriers.}
+\usage{data(dystrophy)}
+\description{
+The \code{dystrophy} data frame has 209 rows and 10 columns.
+}
+\format{
+ This data frame contains the following columns:
+ \describe{
+ \item{OBS}{numeric. Observation number.}
+ \item{HospID}{numeric. Hospital ID number.}
+ \item{AGE}{numeric, age in years.}
+ \item{M}{numeric. Month of examination.}
+ \item{Y}{numeric. Year of examination.}
+ \item{CK}{numeric. Serum marker creatine kinase.}
+ \item{H}{numeric. Serum marker hemopexin.}
+ \item{PK}{numeric. Serum marker pyruvate kinase.}
+ \item{LD}{numeric. Serum marker lactate dehydroginase.}
+ \item{Class}{factor with levels, \code{carrier} and \code{normal}.}
+ }
+}
+
+\details{
+Duchenne Muscular Dystrophy (DMD) is a genetically transmitted disease,
+passed from a mother to her children. Affected female offspring usually suffer
+no apparent symptoms, male offspring with the disease die at young age.
+Although female carriers have no physical symptoms they tend to exhibit
+elevated levels of certain serum enzymes or proteins.
+\cr
+The dystrophy dataset contains 209 observations of 75 female DMD carriers and
+134 female DMD non-carrier. It includes 6 variables describing age of the
+female and the serum parameters serum marker creatine kinase (CK), serum marker
+ hemopexin (H), serum marker pyruvate kinase (PK) and serum marker lactate
+dehydroginase (LD). The serum markers CK and H may be measured rather
+inexpensive from frozen serum, PK and LD requires fresh serum.
+}
+
+\source{
+D.Andrews and A. Herzberg (1985), Data. Berlin: Springer-Verlag.
+}
+
+\references{
+Robert Tibshirani and Geoffry Hinton (1998), Coaching variables for regression and classification. Statistics and Computing 8, 25-33.
+}
+
+\examples{
+\dontrun{
+
+data("dystrophy")
+library("rpart")
+errorest(Class~CK+H~AGE+PK+LD, data = dystrophy, model = inbagg,
+pFUN = list(list(model = lm, predict = mypredict.lm), list(model = rpart)),
+ns = 0.75, estimator = "cv")
+}
+}
+
+\keyword{datasets}
diff --git a/man/errorest.Rd b/man/errorest.Rd
new file mode 100644
index 0000000..961b146
--- /dev/null
+++ b/man/errorest.Rd
@@ -0,0 +1,257 @@
+\name{errorest}
+\alias{errorest}
+\alias{errorest.data.frame}
+\alias{errorest.default}
+\title{ Estimators of Prediction Error }
+\description{
+Resampling based estimates of prediction error: misclassification error,
+root mean squared error or Brier score for survival data.
+}
+\usage{
+\method{errorest}{data.frame}(formula, data, subset, na.action=na.omit,
+ model=NULL, predict=NULL,
+ estimator=c("cv", "boot", "632plus"),
+ est.para=control.errorest(), ...)
+}
+
+\arguments{
+ \item{formula}{a formula of the form \code{lhs ~ rhs}.
+ Either describing the model of explanatory and
+ response variables in the usual way (see \code{\link{lm}})
+ or the model between explanatory and intermediate variables
+ in the framework of indirect classification,
+ see \code{\link{inclass}}.}
+ \item{data}{a data frame containing the variables in the model formula
+ and additionally the class membership variable
+ if \code{model = inclass}. \code{data} is required for
+ indirect classification, otherwise \code{formula} is evaluated
+ in the calling environment.}
+ \item{subset}{optional vector, specifying a subset of observations to
+ be used.}
+ \item{na.action}{function which indicates what should happen when the data
+ contains \code{NA}'s, defaults to \code{\link{na.omit}}.}
+ \item{model}{function. Modelling technique whose error rate is to be
+ estimated. The function \code{model} can either return an
+ object representing a fitted model or a function with
+ argument \code{newdata} which returns predicted values. In
+ this case, the \code{predict} argument to \code{errorest} is
+ ignored.}
+ \item{predict}{function. Prediction method to be used. The vector of
+ predicted values must have the same length as the the
+ number of to-be-predicted observations. Predictions
+ corresponding to missing data must be replaced by \code{NA}.
+ Additionally, \code{predict} has to return predicted values
+ comparable to the responses (that is: factors for
+ classification problems). See the example on how to make
+ this sure for any predictor.}
+ \item{estimator}{estimator of the misclassification error:
+ \code{cv} cross-validation, \code{boot} bootstrap or
+ \code{632plus} bias corrected bootstrap (classification
+ only). }
+ \item{est.para}{a list of additional parameters that control the
+ calculation of the estimator, see
+ \code{\link{control.errorest}} for details.}
+ \item{\dots}{additional parameters to \code{model}.}
+}
+\details{
+ The prediction error for classification and regression models as well as
+predictive models for censored data using cross-validation or the
+bootstrap can be computed by \code{errorest}. For classification problems,
+the estimated misclassification error is returned. The root mean squared
+error is computed for regression problems and the Brier score for censored
+data (Graf et al., 1999) is reported if the response is censored.
+
+Any model can be specified as long as it is a function with arguments
+\code{model(formula, data, subset, na.action, ...)}. If
+a method \code{predict.model(object, newdata, ...)} is available,
+\code{predict} does not need to be specified. However, \code{predict}
+has to return predicted values in the same order and of the same length
+corresponding to the response. See the examples below.
+
+$k$-fold cross-validation and the usual bootstrap estimator with
+\code{est.para$nboot} bootstrap replications can be computed for
+all kind of problems. The bias corrected .632+ bootstrap
+by Efron and Tibshirani (1997) is available for classification problems
+only. Use \code{\link{control.errorest}} to specify additional arguments.
+
+\code{errorest} is a formula based interface to the generic functions
+\code{\link{cv}} or \code{\link{bootest}} which implement methods for
+classification, regression and survival problems.
+}
+
+\value{
+ The class of the object returned depends on the class of the response
+variable and the estimator used. In each case, it is a list with an element
+\code{error} and additional information. \code{print} methods are available
+for the inspection of the results.
+}
+
+\references{
+
+Brian D. Ripley (1996), \emph{Pattern Recognition and Neural Networks}.
+Cambridge: Cambridge University Press.
+
+Bradley Efron and Robert Tibshirani (1997),
+Improvements on Cross-Validation: The .632+ Bootstrap Estimator.
+\emph{Journal of the American Statistical Association} \bold{92}(438),
+548--560.
+
+Erika Graf, Claudia Schmoor, Willi Sauerbrei and Martin Schumacher (1999),
+Assessment and comparison of prognostic classification schemes for
+survival data. \emph{Statistics in Medicine} \bold{18}(17-18), 2529--2545.
+
+Rosa A. Schiavo and David J. Hand (2000), Ten More Years of Error Rate
+Research. \emph{International Statistical Review} \bold{68}(3), 296-310.
+
+David J. Hand, Hua Gui Li, Niall M. Adams (2001),
+Supervised Classification with Structured Class Definitions.
+\emph{Computational Statistics & Data Analysis} \bold{36},
+209--225.
+
+
+}
+\examples{
+
+# Classification
+
+data("iris")
+library("MASS")
+
+# force predict to return class labels only
+mypredict.lda <- function(object, newdata)
+ predict(object, newdata = newdata)$class
+
+# 10-fold cv of LDA for Iris data
+errorest(Species ~ ., data=iris, model=lda,
+ estimator = "cv", predict= mypredict.lda)
+
+data("PimaIndiansDiabetes", package = "mlbench")
+\dontrun{
+# 632+ bootstrap of LDA for Diabetes data
+errorest(diabetes ~ ., data=PimaIndiansDiabetes, model=lda,
+ estimator = "632plus", predict= mypredict.lda)
+}
+
+#cv of a fixed partition of the data
+list.tindx <- list(1:100, 101:200, 201:300, 301:400, 401:500,
+ 501:600, 601:700, 701:768)
+
+errorest(diabetes ~ ., data=PimaIndiansDiabetes, model=lda,
+ estimator = "cv", predict = mypredict.lda,
+ est.para = control.errorest(list.tindx = list.tindx))
+
+\dontrun{
+#both bootstrap estimations based on fixed partitions
+
+list.tindx <- vector(mode = "list", length = 25)
+for(i in 1:25) {
+ list.tindx[[i]] <- sample(1:768, 768, TRUE)
+}
+
+errorest(diabetes ~ ., data=PimaIndiansDiabetes, model=lda,
+ estimator = c("boot", "632plus"), predict= mypredict.lda,
+ est.para = control.errorest(list.tindx = list.tindx))
+
+}
+data("Glass", package = "mlbench")
+
+# LDA has cross-validated misclassification error of
+# 38\% (Ripley, 1996, page 98)
+
+# Pruned trees about 32\% (Ripley, 1996, page 230)
+
+# use stratified sampling here, i.e. preserve the class proportions
+errorest(Type ~ ., data=Glass, model=lda,
+ predict=mypredict.lda, est.para=control.errorest(strat=TRUE))
+
+# force predict to return class labels
+mypredict.rpart <- function(object, newdata)
+ predict(object, newdata = newdata,type="class")
+
+library("rpart")
+pruneit <- function(formula, ...)
+ prune(rpart(formula, ...), cp =0.01)
+
+errorest(Type ~ ., data=Glass, model=pruneit,
+ predict=mypredict.rpart, est.para=control.errorest(strat=TRUE))
+
+# compute sensitivity and specifity for stabilised LDA
+
+data("GlaucomaM", package = "TH.data")
+
+error <- errorest(Class ~ ., data=GlaucomaM, model=slda,
+ predict=mypredict.lda, est.para=control.errorest(predictions=TRUE))
+
+# sensitivity
+
+mean(error$predictions[GlaucomaM$Class == "glaucoma"] == "glaucoma")
+
+# specifity
+
+mean(error$predictions[GlaucomaM$Class == "normal"] == "normal")
+
+# Indirect Classification: Smoking data
+
+data(Smoking)
+# Set three groups of variables:
+# 1) explanatory variables are: TarY, NicY, COY, Sex, Age
+# 2) intermediate variables are: TVPS, BPNL, COHB
+# 3) response (resp) is defined by:
+
+resp <- function(data){
+ data <- data[, c("TVPS", "BPNL", "COHB")]
+ res <- t(t(data) > c(4438, 232.5, 58))
+ res <- as.factor(ifelse(apply(res, 1, sum) > 2, 1, 0))
+ res
+}
+
+response <- resp(Smoking[ ,c("TVPS", "BPNL", "COHB")])
+smoking <- cbind(Smoking, response)
+
+formula <- response~TVPS+BPNL+COHB~TarY+NicY+COY+Sex+Age
+
+# Estimation per leave-one-out estimate for the misclassification is
+# 36.36\% (Hand et al., 2001), using indirect classification with
+# linear models
+\dontrun{
+errorest(formula, data = smoking, model = inclass,estimator = "cv",
+ pFUN = list(list(model=lm, predict = mypredict.lm)), cFUN = resp,
+ est.para=control.errorest(k=nrow(smoking)))
+}
+
+# Regression
+
+data("BostonHousing", package = "mlbench")
+
+# 10-fold cv of lm for Boston Housing data
+errorest(medv ~ ., data=BostonHousing, model=lm,
+ est.para=control.errorest(random=FALSE))
+
+# the same, with "model" returning a function for prediction
+# instead of an object of class "lm"
+
+mylm <- function(formula, data) {
+ mod <- lm(formula, data)
+ function(newdata) predict(mod, newdata)
+}
+
+errorest(medv ~ ., data=BostonHousing, model=mylm,
+est.para=control.errorest(random=FALSE))
+
+
+# Survival data
+
+data("GBSG2", package = "TH.data")
+library("survival")
+
+# prediction is fitted Kaplan-Meier
+predict.survfit <- function(object, newdata) object
+
+# 5-fold cv of Kaplan-Meier for GBSG2 study
+errorest(Surv(time, cens) ~ 1, data=GBSG2, model=survfit,
+ predict=predict.survfit, est.para=control.errorest(k=5))
+
+
+}
+
+\keyword{misc}
diff --git a/man/inbagg.Rd b/man/inbagg.Rd
new file mode 100644
index 0000000..7ec048f
--- /dev/null
+++ b/man/inbagg.Rd
@@ -0,0 +1,112 @@
+\name{inbagg}
+\alias{inbagg}
+\alias{inbagg.default}
+\alias{inbagg.data.frame}
+
+\title{Indirect Bagging}
+
+\description{
+ Function to perform the indirect bagging and subagging.
+}
+
+\usage{
+\method{inbagg}{data.frame}(formula, data, pFUN=NULL,
+ cFUN=list(model = NULL, predict = NULL, training.set = NULL),
+ nbagg = 25, ns = 0.5, replace = FALSE, ...)
+}
+
+\arguments{
+ \item{formula}{formula. A \code{formula} specified as \code{y~w1+w2+w3~x1+x2+x3} describes how to model the intermediate variables \code{w1, w2, w3} and the response variable \code{y}, if no other formula is specified by the elements of \code{pFUN} or in \code{cFUN}}
+ \item{data}{data frame of explanatory, intermediate and response variables.}
+ \item{pFUN}{list of lists, which describe models for the intermediate variables, details are given below.}
+ \item{cFUN}{either a fixed function with argument \code{newdata} and returning the class membership by default, or a list specifying a classifying model, similar to one element of \code{pFUN}. Details are given below.}
+ \item{nbagg}{number of bootstrap samples.}
+ \item{ns}{proportion of sample to be drawn from the learning
+ sample. By default, subagging with 50\% is performed, i.e. draw
+ 0.5*n out of n without replacement.}
+ \item{replace}{logical. Draw with or without replacement.}
+ \item{\dots}{additional arguments (e.g. \code{subset}).}
+}
+\details{
+ A given data set is subdivided into three types of variables: explanatory, intermediate and response variables.\cr
+
+ Here, each specified intermediate variable is modelled separately
+ following \code{pFUN}, a list of lists with elements specifying an
+arbitrary number of models for the intermediate variables and an
+optional element \code{training.set = c("oob", "bag", "all")}. The
+element \code{training.set} determines whether, predictive models for
+the intermediate are calculated based on the out-of-bag sample
+(\code{"oob"}), the default, on the bag sample (\code{"bag"}) or on all
+available observations (\code{"all"}). The elements of \code{pFUN},
+specifying the models for the intermediate variables are lists as
+described in \code{\link{inclass}}.
+Note that, if no formula is given in these elements, the functional
+relationship of \code{formula} is used.\cr
+
+The response variable is modelled following \code{cFUN}.
+This can either be a fixed classifying function as described in Peters
+et al. (2003) or a list,
+which specifies the modelling technique to be applied. The list
+contains the arguments \code{model} (which model to be fitted),
+\code{predict} (optional, how to predict), \code{formula} (optional, of
+type \code{y~w1+w2+w3+x1+x2} determines the variables the classifying
+function is based on) and the optional argument \code{training.set =
+ c("fitted.bag", "original", "fitted.subset")}
+specifying whether the classifying function is trained on the predicted
+observations of the bag sample (\code{"fitted.bag"}),
+on the original observations (\code{"original"}) or on the
+predicted observations not included in a defined subset
+(\code{"fitted.subset"}). Per default the formula specified in
+\code{formula} determines the variables, the classifying function is
+based on.\cr
+
+Note that the default of \code{cFUN = list(model = NULL, training.set = "fitted.bag")}
+uses the function \code{\link[rpart]{rpart}} and
+the predict function \code{predict(object, newdata, type = "class")}.
+}
+\value{
+ An object of class \code{"inbagg"}, that is a list with elements
+ \item{mtrees}{a list of length \code{nbagg}, describing the prediction
+ models corresponding
+ to each bootstrap sample. Each element of \code{mtrees}
+ is a list with elements \code{bindx} (observations of bag sample),
+ \code{btree} (classifying function of bag sample) and \code{bfct} (predictive models for intermediates of bag sample).}
+ \item{y}{vector of response values.}
+ \item{W}{data frame of intermediate variables.}
+ \item{X}{data frame of explanatory variables.}
+}
+
+\references{
+David J. Hand, Hua Gui Li, Niall M. Adams (2001),
+Supervised classification with structured class definitions.
+\emph{Computational Statistics & Data Analysis} \bold{36},
+209--225.
+
+Andrea Peters, Berthold Lausen, Georg Michelson and Olaf Gefeller (2003),
+Diagnosis of glaucoma by indirect classifiers.
+\emph{Methods of Information in Medicine} \bold{1}, 99-103.
+}
+
+
+\seealso{\code{\link[rpart]{rpart}}, \code{\link{bagging}},
+\code{\link{lm}}}
+
+
+\examples{
+
+library("MASS")
+library("rpart")
+y <- as.factor(sample(1:2, 100, replace = TRUE))
+W <- mvrnorm(n = 200, mu = rep(0, 3), Sigma = diag(3))
+X <- mvrnorm(n = 200, mu = rep(2, 3), Sigma = diag(3))
+colnames(W) <- c("w1", "w2", "w3")
+colnames(X) <- c("x1", "x2", "x3")
+DATA <- data.frame(y, W, X)
+
+
+pFUN <- list(list(formula = w1~x1+x2, model = lm, predict = mypredict.lm),
+list(model = rpart))
+
+inbagg(y~w1+w2+w3~x1+x2+x3, data = DATA, pFUN = pFUN)
+}
+\keyword{misc}
diff --git a/man/inclass.Rd b/man/inclass.Rd
new file mode 100644
index 0000000..6644294
--- /dev/null
+++ b/man/inclass.Rd
@@ -0,0 +1,116 @@
+\name{inclass}
+\alias{inclass}
+\alias{inclass.default}
+\alias{inclass.data.frame}
+
+\title{Indirect Classification}
+
+\description{
+A framework for the indirect classification approach.
+}
+
+\usage{
+\method{inclass}{data.frame}(formula, data, pFUN = NULL, cFUN = NULL, ...)
+}
+\arguments{
+ \item{formula}{formula. A \code{formula} specified as
+ \code{y~w1+w2+w3~x1+x2+x3} models each intermediate variable
+ \code{w1, w2, w3} by \code{wi~x1+x2+x3} and the response by
+ \code{y~w1+w2+w3} if no other formulas are given in \code{pFUN} or \code{cFUN}.}
+ \item{data}{data frame of explanatory, intermediate and response variables.}
+ \item{pFUN}{list of lists, which describe models for the intermediate variables, see below for details.}
+ \item{cFUN}{either a function or a list which describes the model for the
+response variable. The function has the argument \code{newdata} only.}
+ \item{\dots}{additional arguments, passed to model fitting of the
+ response variable.}
+}
+
+\details{
+A given data set is subdivided into three types of variables: those to be
+used predicting the class (explanatory variables) those to be used defining
+the class (intermediate variables) and the class membership variable itself
+(response variable). Intermediate variables are modelled based on the
+explanatory variables, the class membership variable is defined on the
+intermediate variables.\cr
+
+Each specified intermediate variable is modelled separately
+following \code{pFUN} and a formula specified by \code{formula}.
+\code{pFUN} is a list of lists, the maximum length of
+\code{pFUN} is the number of intermediate variables. Each element of
+\code{pFUN} is a list with elements:\cr
+ \code{model} - a function with arguments \code{formula} and
+ \code{data}; \cr
+ \code{predict} - an optional function with arguments \code{object, newdata} only,
+ if \code{predict} is not specified, the predict method of \code{model}
+ is used; \cr
+ \code{formula} - specifies the formula for the corresponding
+ \code{model} (optional),
+ the formula described in \code{y~w1+w2+w3~x1+x2+x3} is used if no other is
+ specified.
+\cr
+
+The response is classified following \code{cFUN}, which is either a fixed
+function or a list as described below. The determined function \code{cFUN} assigns the intermediate (and
+explanatory) variables to a certain class membership, the list
+\code{cFUN} has the elements \code{formula, model, predict} and
+\code{training.set}. The elements \code{formula, model, predict} are
+structured as described by \code{pFUN}, the described model is
+trained on the original (intermediate variables) if \code{training.set="original"}
+or if \code{training.set = NULL}, on the fitted values if
+\code{training.set = "fitted"} or on observations not included in a
+specified subset if \code{training.set = "subset"}.
+\cr
+
+A list of prediction models corresponding to each
+intermediate variable, a predictive function for the response, a list of
+specifications for the intermediate and for the response are returned. \cr
+For a detailed description on indirect
+classification see Hand et al. (2001).
+}
+\value{
+ An object of class \code{inclass}, consisting of a list of
+ \item{model.intermediate}{list of fitted models for each intermediate
+variable.}
+ \item{model.response}{predictive model for the response variable.}
+ \item{para.intermediate}{list, where each element is again a list and specifies
+the model for each intermediate variable.}
+ \item{para.response}{a list which specifies the model for response variable.}
+}
+\references{
+David J. Hand, Hua Gui Li, Niall M. Adams (2001),
+Supervised classification with structured class definitions.
+\emph{Computational Statistics & Data Analysis} \bold{36},
+209--225.
+
+Andrea Peters, Berthold Lausen, Georg Michelson and Olaf Gefeller (2003),
+Diagnosis of glaucoma by indirect classifiers.
+\emph{Methods of Information in Medicine} \bold{1}, 99-103.
+}
+
+\seealso{\code{\link{bagging}}, \code{\link{inclass}}}
+
+\examples{
+data("Smoking", package = "ipred")
+# Set three groups of variables:
+# 1) explanatory variables are: TarY, NicY, COY, Sex, Age
+# 2) intermediate variables are: TVPS, BPNL, COHB
+# 3) response (resp) is defined by:
+
+classify <- function(data){
+ data <- data[,c("TVPS", "BPNL", "COHB")]
+ res <- t(t(data) > c(4438, 232.5, 58))
+ res <- as.factor(ifelse(apply(res, 1, sum) > 2, 1, 0))
+ res
+}
+
+response <- classify(Smoking[ ,c("TVPS", "BPNL", "COHB")])
+smoking <- data.frame(Smoking, response)
+
+formula <- response~TVPS+BPNL+COHB~TarY+NicY+COY+Sex+Age
+
+inclass(formula, data = smoking, pFUN = list(list(model = lm, predict =
+mypredict.lm)), cFUN = classify)
+
+}
+
+\keyword{misc}
diff --git a/man/ipred-internal.Rd b/man/ipred-internal.Rd
new file mode 100644
index 0000000..91b01db
--- /dev/null
+++ b/man/ipred-internal.Rd
@@ -0,0 +1,13 @@
+\name{ipred-internal}
+\alias{getsurv}
+\title{Internal ipred functions}
+\description{
+ Internal ipred functions.
+}
+\usage{
+getsurv(obj, times)
+}
+\details{
+ This functions are not to be called by the user.
+}
+\keyword{internal}
diff --git a/man/ipredknn.Rd b/man/ipredknn.Rd
new file mode 100644
index 0000000..b055c39
--- /dev/null
+++ b/man/ipredknn.Rd
@@ -0,0 +1,48 @@
+\name{ipredknn}
+\alias{ipredknn}
+\title{ k-Nearest Neighbour Classification }
+\description{
+ $k$-nearest neighbour classification with an interface compatible to
+\code{\link{bagging}} and \code{\link{errorest}}.
+}
+\usage{
+ipredknn(formula, data, subset, na.action, k=5, \dots)
+}
+\arguments{
+ \item{formula}{a formula of the form \code{lhs ~ rhs} where \code{lhs}
+ is the response variable and \code{rhs} a set of
+ predictors.}
+ \item{data}{optional data frame containing the variables in the
+ model formula.}
+ \item{subset}{optional vector specifying a subset of observations
+ to be used.}
+ \item{na.action}{function which indicates what should happen when
+ the data contain \code{NA}s.}
+ \item{k}{number of neighbours considered, defaults to 5.}
+ \item{...}{additional parameters.}
+}
+
+\details{
+ This is a wrapper to \code{\link[class]{knn}} in order to be able to
+ use k-NN in \code{\link{bagging}} and \code{\link{errorest}}.
+}
+
+\value{
+ An object of class \code{ipredknn}. See \code{\link{predict.ipredknn}}.
+}
+
+
+\examples{
+
+library("mlbench")
+learn <- as.data.frame(mlbench.twonorm(300))
+
+mypredict.knn <- function(object, newdata)
+ predict.ipredknn(object, newdata, type="class")
+
+errorest(classes ~., data=learn, model=ipredknn,
+ predict=mypredict.knn)
+
+
+}
+\keyword{multivariate}
diff --git a/man/kfoldcv.Rd b/man/kfoldcv.Rd
new file mode 100644
index 0000000..b73a896
--- /dev/null
+++ b/man/kfoldcv.Rd
@@ -0,0 +1,38 @@
+\name{kfoldcv}
+\alias{kfoldcv}
+\title{ Subsamples for k-fold Cross-Validation }
+\description{
+ Computes feasible sample sizes for the k groups in k-fold cv if N/k is not
+an integer.
+}
+\usage{
+kfoldcv(k, N, nlevel=NULL)
+}
+\arguments{
+ \item{k}{ number of groups. }
+ \item{N}{ total sample size. }
+ \item{nlevel}{ a vector of sample sizes for stratified sampling.}
+}
+\details{
+ If N/k is not an integer, k-fold cv is not unique. Determine meaningful
+ sample sizes.
+}
+\value{
+ A vector of length \code{k}.
+}
+\examples{
+
+# 10-fold CV with N = 91
+
+kfoldcv(10, 91)
+
+\testonly{
+k <- sample(5:15, 1)
+k
+N <- sample(50:150, 1)
+N
+stopifnot(sum(kfoldcv(k, N)) == N)
+}
+
+}
+\keyword{misc}
diff --git a/man/mypredict.lm.Rd b/man/mypredict.lm.Rd
new file mode 100644
index 0000000..2f84d2d
--- /dev/null
+++ b/man/mypredict.lm.Rd
@@ -0,0 +1,28 @@
+\name{mypredict.lm}
+\alias{mypredict.lm}
+\title{Predictions Based on Linear Models}
+\description{
+Function to predict a vector of full length (number of observations), where predictions according to missing
+explanatory values are replaced by \code{NA}.
+}
+
+\usage{
+mypredict.lm(object, newdata)
+}
+
+\arguments{
+ \item{object}{an object of class \code{lm}.}
+ \item{newdata}{matrix or data frame to be predicted according to \code{object}.}
+}
+
+\value{
+Vector of predicted values.
+}
+
+
+\note{\code{predict.lm} delivers a vector of reduced length, i.e. rows where
+explanatory variables are missing are omitted. The full length of the
+predicted observation vector is necessary in the indirect classification
+approach (\code{\link{predict.inclass}}).}
+
+\keyword{misc}
diff --git a/man/predict.bagging.Rd b/man/predict.bagging.Rd
new file mode 100644
index 0000000..98d1d8d
--- /dev/null
+++ b/man/predict.bagging.Rd
@@ -0,0 +1,85 @@
+\name{predict.classbagg}
+\alias{predict.classbagg}
+\alias{predict.regbagg}
+\alias{predict.survbagg}
+\title{ Predictions from Bagging Trees }
+\description{
+ Predict the outcome of a new observation based on multiple trees.
+}
+\usage{
+\method{predict}{classbagg}(object, newdata=NULL, type=c("class", "prob"),
+ aggregation=c("majority", "average", "weighted"), \dots)
+\method{predict}{regbagg}(object, newdata=NULL, aggregation=c("average",
+ "weighted"), \dots)
+\method{predict}{survbagg}(object, newdata=NULL,\dots)
+}
+\arguments{
+ \item{object}{object of classes \code{classbagg}, \code{regbagg} or
+ \code{survbagg}.}
+ \item{newdata}{a data frame of new observations. }
+ \item{type}{character string denoting the type of predicted value
+ returned for classification trees. Either \code{class}
+ (predicted classes are returned) or \code{prob}
+ (estimated class probabilities are returned).}
+ \item{aggregation}{character string specifying how to aggregate, see below.}
+ \item{...}{additional arguments, currently not passed to any function.}
+}
+\details{
+ There are (at least) three different ways to aggregate the predictions of
+bagging classification trees. Most famous is class majority voting
+(\code{aggregation="majority"}) where the most frequent class is returned. The
+second way is choosing the class with maximal averaged class probability
+(\code{aggregation="average"}). The third method is based on the "aggregated learning
+sample", introduced by Hothorn et al. (2003) for survival trees.
+The prediction of a new observation is the majority class, mean or
+Kaplan-Meier curve of all observations from the learning sample
+identified by the \code{nbagg} leaves containing the new observation.
+For regression trees, only averaged or weighted predictions are possible.
+
+By default, the out-of-bag estimate is computed if \code{newdata} is NOT
+specified. Therefore, the predictions of \code{predict(object)} are "honest"
+in some way (this is not possible for combined models via \code{comb} in
+\code{\link{bagging}}).
+If you like to compute the predictions for the learning sample
+itself, use \code{newdata} to specify your data.
+
+}
+\value{
+ The predicted class or estimated class probabilities are returned for
+classification trees. The predicted endpoint is returned in regression
+problems and the predicted Kaplan-Meier curve is returned for survival
+trees.
+}
+
+
+\references{
+
+Leo Breiman (1996), Bagging Predictors. \emph{Machine Learning}
+\bold{24}(2), 123--140.
+
+Torsten Hothorn, Berthold Lausen, Axel Benner and Martin
+Radespiel-Troeger (2004), Bagging Survival Trees.
+\emph{Statistics in Medicine}, \bold{23}(1), 77--91.
+
+}
+
+\examples{
+
+data("Ionosphere", package = "mlbench")
+Ionosphere$V2 <- NULL # constant within groups
+
+# nbagg = 10 for performance reasons here
+mod <- bagging(Class ~ ., data=Ionosphere)
+
+# out-of-bag estimate
+
+mean(predict(mod) != Ionosphere$Class)
+
+# predictions for the first 10 observations
+
+predict(mod, newdata=Ionosphere[1:10,])
+
+predict(mod, newdata=Ionosphere[1:10,], type="prob")
+
+}
+\keyword{tree}
diff --git a/man/predict.inbagg.Rd b/man/predict.inbagg.Rd
new file mode 100644
index 0000000..ceb80a8
--- /dev/null
+++ b/man/predict.inbagg.Rd
@@ -0,0 +1,57 @@
+\name{predict.inbagg}
+\alias{predict.inbagg}
+
+\title{Predictions from an Inbagg Object}
+\description{
+Predicts the class membership of new observations through indirect
+bagging.
+}
+\usage{
+\method{predict}{inbagg}(object, newdata, ...)
+}
+\arguments{
+ \item{object}{object of class \code{inbagg}, see \code{\link{inbagg}}.}
+ \item{newdata}{data frame to be classified.}
+ \item{\dots}{additional argumends corresponding to the predictive models.}
+}
+\details{
+Predictions of class memberships are calculated. i.e. values of the
+intermediate variables are predicted following \code{pFUN} and classified following \code{cFUN},
+see \code{\link{inbagg}}.
+}
+\value{
+The vector of predicted classes is returned.
+}
+
+\references{
+David J. Hand, Hua Gui Li, Niall M. Adams (2001),
+Supervised classification with structured class definitions.
+\emph{Computational Statistics & Data Analysis} \bold{36},
+209--225.
+
+Andrea Peters, Berthold Lausen, Georg Michelson and Olaf Gefeller (2003),
+Diagnosis of glaucoma by indirect classifiers.
+\emph{Methods of Information in Medicine} \bold{1}, 99-103.
+}
+
+
+\seealso{\code{\link{inbagg}}}
+
+\examples{
+
+library("MASS")
+library("rpart")
+y <- as.factor(sample(1:2, 100, replace = TRUE))
+W <- mvrnorm(n = 200, mu = rep(0, 3), Sigma = diag(3))
+X <- mvrnorm(n = 200, mu = rep(2, 3), Sigma = diag(3))
+colnames(W) <- c("w1", "w2", "w3")
+colnames(X) <- c("x1", "x2", "x3")
+DATA <- data.frame(y, W, X)
+
+pFUN <- list(list(formula = w1~x1+x2, model = lm),
+list(model = rpart))
+
+RES <- inbagg(y~w1+w2+w3~x1+x2+x3, data = DATA, pFUN = pFUN)
+predict(RES, newdata = X)
+}
+\keyword{misc}
diff --git a/man/predict.inclass.Rd b/man/predict.inclass.Rd
new file mode 100644
index 0000000..6ee2b60
--- /dev/null
+++ b/man/predict.inclass.Rd
@@ -0,0 +1,119 @@
+\name{predict.inclass}
+\alias{predict.inclass}
+
+\title{Predictions from an Inclass Object}
+
+\description{
+Predicts the class membership of new observations through indirect
+classification.
+}
+
+\usage{
+\method{predict}{inclass}(object, newdata, ...)
+}
+
+\arguments{
+ \item{object}{ object of class \code{inclass}, see \code{\link{inclass}}.}
+ \item{newdata}{data frame to be classified.}
+ \item{\dots}{additional arguments corresponding to the predictive models
+specified in \code{\link{inclass}}.}
+}
+\details{
+Predictions of class memberships are calculated. i.e. values of the
+intermediate variables are predicted and classified following \code{cFUN},
+see \code{\link{inclass}}.
+}
+\value{
+The vector of predicted classes is returned.
+}
+\references{
+David J. Hand, Hua Gui Li, Niall M. Adams (2001),
+Supervised classification with structured class definitions.
+\emph{Computational Statistics & Data Analysis} \bold{36},
+209--225.
+
+Andrea Peters, Berthold Lausen, Georg Michelson and Olaf Gefeller (2003),
+Diagnosis of glaucoma by indirect classifiers.
+\emph{Methods of Information in Medicine} \bold{1}, 99-103.
+}
+
+
+\seealso{\code{\link{inclass}}}
+
+\examples{
+\dontrun{
+# Simulation model, classification rule following Hand et al. (2001)
+
+theta90 <- varset(N = 1000, sigma = 0.1, theta = 90, threshold = 0)
+
+dataset <- as.data.frame(cbind(theta90$explanatory, theta90$intermediate))
+names(dataset) <- c(colnames(theta90$explanatory),
+colnames(theta90$intermediate))
+
+classify <- function(Y, threshold = 0) {
+ Y <- Y[,c("y1", "y2")]
+ z <- (Y > threshold)
+ resp <- as.factor(ifelse((z[,1] + z[,2]) > 1, 1, 0))
+ return(resp)
+}
+
+formula <- response~y1+y2~x1+x2
+
+fit <- inclass(formula, data = dataset, pFUN = list(list(model = lm)),
+ cFUN = classify)
+
+predict(object = fit, newdata = dataset)
+
+
+data("Smoking", package = "ipred")
+
+# explanatory variables are: TarY, NicY, COY, Sex, Age
+# intermediate variables are: TVPS, BPNL, COHB
+# reponse is defined by:
+
+classify <- function(data){
+ data <- data[,c("TVPS", "BPNL", "COHB")]
+ res <- t(t(data) > c(4438, 232.5, 58))
+ res <- as.factor(ifelse(apply(res, 1, sum) > 2, 1, 0))
+ res
+}
+
+response <- classify(Smoking[ ,c("TVPS", "BPNL", "COHB")])
+smoking <- cbind(Smoking, response)
+
+formula <- response~TVPS+BPNL+COHB~TarY+NicY+COY+Sex+Age
+
+fit <- inclass(formula, data = smoking,
+ pFUN = list(list(model = lm)), cFUN = classify)
+
+
+predict(object = fit, newdata = smoking)
+}
+
+data("GlaucomaMVF", package = "ipred")
+library("rpart")
+glaucoma <- GlaucomaMVF[,(names(GlaucomaMVF) != "tension")]
+# explanatory variables are derived by laser scanning image and intra occular pressure
+# intermediate variables are: clv, cs, lora
+# response is defined by
+
+classify <- function (data) {
+ attach(data)
+ res <- ifelse((!is.na(clv) & !is.na(lora) & clv >= 5.1 & lora >=
+ 49.23372) | (!is.na(clv) & !is.na(lora) & !is.na(cs) &
+ clv < 5.1 & lora >= 58.55409 & cs < 1.405) | (is.na(clv) &
+ !is.na(lora) & !is.na(cs) & lora >= 58.55409 & cs < 1.405) |
+ (!is.na(clv) & is.na(lora) & cs < 1.405), 0, 1)
+ detach(data)
+ factor (res, labels = c("glaucoma", "normal"))
+}
+
+fit <- inclass(Class~clv+lora+cs~., data = glaucoma,
+ pFUN = list(list(model = rpart)), cFUN = classify)
+
+data("GlaucomaM", package = "TH.data")
+predict(object = fit, newdata = GlaucomaM)
+
+}
+
+\keyword{misc}
diff --git a/man/predict.ipredknn.Rd b/man/predict.ipredknn.Rd
new file mode 100644
index 0000000..fbeb67d
--- /dev/null
+++ b/man/predict.ipredknn.Rd
@@ -0,0 +1,29 @@
+\name{predict.ipredknn}
+\alias{predict.ipredknn}
+\title{ Predictions from k-Nearest Neighbors }
+\description{
+ Predict the class of a new observation based on k-NN.
+}
+\usage{
+\method{predict}{ipredknn}(object, newdata, type=c("prob", "class"), ...)
+}
+\arguments{
+ \item{object}{object of class \code{ipredknn}.}
+ \item{newdata}{a data frame of new observations. }
+ \item{type}{return either the predicted class or the
+ the proportion of the votes for the winning
+ class.}
+ \item{...}{additional arguments.}
+}
+\details{
+ This function is a method for the generic function \code{\link{predict}}
+for class \code{ipredknn}. For the details see \code{\link[class]{knn}}.
+
+}
+\value{
+Either the predicted class or the
+the proportion of the votes for the winning class.
+}
+
+
+\keyword{multivariate}
diff --git a/man/predict.slda.Rd b/man/predict.slda.Rd
new file mode 100644
index 0000000..c3f0675
--- /dev/null
+++ b/man/predict.slda.Rd
@@ -0,0 +1,29 @@
+\name{predict.slda}
+\alias{predict.slda}
+\title{ Predictions from Stabilised Linear Discriminant Analysis }
+\description{
+ Predict the class of a new observation based on stabilised LDA.
+}
+\usage{
+\method{predict}{slda}(object, newdata, ...)
+}
+\arguments{
+ \item{object}{object of class \code{slda}.}
+ \item{newdata}{a data frame of new observations. }
+ \item{...}{additional arguments passed to
+\code{\link[MASS]{predict.lda}}.}
+}
+\details{
+ This function is a method for the generic function \code{\link{predict}}
+for class \code{slda}. For the details see \code{\link[MASS]{predict.lda}}.
+
+}
+\value{
+A list with components
+\item{class}{the predicted class (a factor).}
+\item{posterior}{posterior probabilities for the classes.}
+\item{x}{the scores of test cases.}
+}
+
+
+\keyword{multivariate}
diff --git a/man/print.bagging.Rd b/man/print.bagging.Rd
new file mode 100644
index 0000000..1e9fae5
--- /dev/null
+++ b/man/print.bagging.Rd
@@ -0,0 +1,21 @@
+\name{print.classbagg}
+\alias{print}
+\alias{print.classbagg}
+\alias{print.regbagg}
+\alias{print.survbagg}
+\title{Print Method for Bagging Trees}
+\description{
+ Print objects returned by \code{\link{bagging}} in nice layout.
+}
+\usage{
+\method{print}{classbagg}(x, digits, \dots)
+}
+\arguments{
+ \item{x}{object returned by \code{\link{bagging}}.}
+ \item{digits}{how many digits should be printed.}
+ \item{\dots}{further arguments to be passed to or from methods.}
+}
+\value{
+ none
+}
+\keyword{tree}
diff --git a/man/print.cvclass.Rd b/man/print.cvclass.Rd
new file mode 100644
index 0000000..767fc67
--- /dev/null
+++ b/man/print.cvclass.Rd
@@ -0,0 +1,24 @@
+\name{print.cvclass}
+\alias{print.cvclass}
+\alias{print.cvreg}
+\alias{print.cvsurv}
+\alias{print.bootestclass}
+\alias{print.bootestreg}
+\alias{print.bootestsurv}
+\title{Print Method for Error Rate Estimators}
+\description{
+ Print objects returned by \code{\link{errorest}} in nice layout.
+}
+\usage{
+\method{print}{cvclass}(x, digits=4, ...)
+}
+
+\arguments{
+ \item{x}{an object returned by \code{\link{errorest}}.}
+ \item{digits}{how many digits should be printed.}
+ \item{\dots}{further arguments to be passed to or from methods.}
+}
+\value{
+ none
+}
+\keyword{misc}
diff --git a/man/print.inbagg.Rd b/man/print.inbagg.Rd
new file mode 100644
index 0000000..a5af080
--- /dev/null
+++ b/man/print.inbagg.Rd
@@ -0,0 +1,18 @@
+\name{print.inbagg}
+\alias{print.inbagg}
+\title{Print Method for Inbagg Object}
+\description{
+Print object of class \code{inbagg} in nice layout.
+}
+\usage{
+\method{print}{inbagg}(x, ...)
+}
+\arguments{
+ \item{x}{object of class \code{inbagg}.}
+ \item{\dots}{additional arguments.}
+}
+\details{
+An object of class \code{inbagg} is printed. Information about number and names of the intermediate variables,
+and the number of drawn bootstrap samples is given.
+}
+\keyword{misc}
diff --git a/man/print.inclass.Rd b/man/print.inclass.Rd
new file mode 100644
index 0000000..833d613
--- /dev/null
+++ b/man/print.inclass.Rd
@@ -0,0 +1,18 @@
+\name{print.inclass}
+\alias{print.inclass}
+\title{Print Method for Inclass Object}
+\description{
+Print object of class \code{inclass} in nice layout.
+}
+\usage{
+\method{print}{inclass}(x, ...)
+}
+\arguments{
+ \item{x}{object of class \code{inclass}.}
+ \item{\dots}{additional arguments.}
+}
+\details{
+An object of class \code{inclass} is printed. Information about number and names of the intermediate variables, the used modelling technique and the number of
+drawn bootstrap samples is given.
+}
+\keyword{misc}
diff --git a/man/prune.bagging.Rd b/man/prune.bagging.Rd
new file mode 100644
index 0000000..9883f1f
--- /dev/null
+++ b/man/prune.bagging.Rd
@@ -0,0 +1,38 @@
+\name{prune.classbagg}
+\alias{prune.classbagg}
+\alias{prune.regbagg}
+\alias{prune.survbagg}
+\title{ Pruning for Bagging }
+\description{
+ Prune each of the trees returned by \code{\link{bagging}}.
+}
+\usage{
+\method{prune}{classbagg}(tree, cp=0.01,...)
+}
+\arguments{
+ \item{tree}{ an object returned by \code{\link{bagging}}
+ (calling this \code{tree} is needed by the generic function
+ \code{prune} in package \code{rpart}).}
+ \item{cp}{complexity parameter, see \code{\link[rpart]{prune.rpart}}.}
+ \item{...}{additional arguments to \code{\link[rpart]{prune.rpart}}.}
+}
+\details{
+ By default, \code{\link{bagging}} grows classification
+ trees of maximal size. One may want to prune each tree, however,
+ it is not clear whether or not this may decrease prediction error.
+}
+\value{
+ An object of the same class as \code{tree} with the trees pruned.
+}
+\examples{
+
+data("Glass", package = "mlbench")
+library("rpart")
+
+mod <- bagging(Type ~ ., data=Glass, nbagg=10, coob=TRUE)
+pmod <- prune(mod)
+print(pmod)
+
+
+}
+\keyword{tree}
diff --git a/man/rsurv.Rd b/man/rsurv.Rd
new file mode 100644
index 0000000..8f234ed
--- /dev/null
+++ b/man/rsurv.Rd
@@ -0,0 +1,55 @@
+\name{rsurv}
+\alias{rsurv}
+\title{ Simulate Survival Data }
+\description{
+ Simulation Setup for Survival Data.
+}
+\usage{
+rsurv(N, model=c("A", "B", "C", "D", "tree"), gamma=NULL, fact=1, pnon=10,
+ gethaz=FALSE)
+}
+\arguments{
+ \item{N}{ number of observations. }
+ \item{model}{ type of model. }
+ \item{gamma}{simulate censoring time as runif(N, 0, gamma). Defaults to
+ \code{NULL} (no censoring).}
+ \item{fact}{scale parameter for \code{model=tree}.}
+ \item{pnon}{number of additional non-informative variables for the tree
+ model.}
+ \item{gethaz}{logical, indicating wheather the hazard rate for each
+ observation should be returned.}
+}
+\details{
+ Simulation setup similar to configurations used in LeBlanc and Crowley
+(1992) or Keles and Segal (2002) as well as a tree model used in Hothorn et
+al. (2004). See Hothorn et al. (2004) for the details.
+}
+\value{
+ A data frame with elements \code{time}, \code{cens}, \code{X1} ...
+\code{X5}. If \code{pnon} > 0, additional noninformative covariables are
+added. If \code{gethaz=TRUE}, the \code{hazard} attribute returns the hazard
+rates.
+}
+\references{
+
+ M. LeBlanc and J. Crowley (1992), Relative Risk Trees for
+ Censored Survival Data. \emph{Biometrics} \bold{48}, 411--425.
+
+ S. Keles and M. R. Segal (2002), Residual-based tree-structured
+ survival analysis. \emph{Statistics in Medicine}, \bold{21}, 313--326.
+
+ Torsten Hothorn, Berthold Lausen, Axel Benner and Martin
+ Radespiel-Troeger (2004), Bagging Survival Trees.
+ \emph{Statistics in Medicine}, \bold{23}(1), 77--91.
+
+
+}
+\examples{
+
+library("survival")
+# 3*X1 + X2
+simdat <- rsurv(500, model="C")
+coxph(Surv(time, cens) ~ ., data=simdat)
+
+}
+\keyword{survival}
diff --git a/man/sbrier.Rd b/man/sbrier.Rd
new file mode 100644
index 0000000..9812a75
--- /dev/null
+++ b/man/sbrier.Rd
@@ -0,0 +1,183 @@
+\name{sbrier}
+\alias{sbrier}
+\title{ Model Fit for Survival Data }
+\description{
+ Model fit for survival data: the integrated Brier score for censored
+observations.
+}
+\usage{
+sbrier(obj, pred, btime= range(obj[,1]))
+}
+\arguments{
+ \item{obj}{an object of class \code{Surv}.}
+ \item{pred}{predicted values. Either a probability or a list of
+ \code{survfit} objects. }
+ \item{btime}{numeric vector of times, the integrated Brier score is
+ computed if this is of \code{length > 1}.
+ The Brier score at \code{btime}
+ is returned otherwise.}
+}
+\details{
+ There is no obvious criterion of model fit for censored data. The Brier
+score for censoring as well as it's integrated version were suggested by
+Graf et al (1999).
+
+The integrated Brier score is always computed over a subset of the
+interval given by the range of the time slot of the survival object \code{obj}.
+
+}
+\value{
+ The (integrated) Brier score with attribute \code{time} is returned.
+}
+\seealso{
+ More measures for the validation of predicted surival probabilities
+ are implemented in package \code{pec}.
+}
+\references{
+
+Erika Graf, Claudia Schmoor, Willi Sauerbrei and Martin Schumacher (1999),
+Assessment and comparison of prognostic classification schemes for
+survival data. \emph{Statistics in Medicine} \bold{18}(17-18), 2529--2545.
+
+}
+
+\examples{
+
+library("survival")
+data("DLBCL", package = "ipred")
+smod <- Surv(DLBCL$time, DLBCL$cens)
+
+KM <- survfit(smod ~ 1)
+# integrated Brier score up to max(DLBCL$time)
+sbrier(smod, KM)
+
+# integrated Brier score up to time=50
+sbrier(smod, KM, btime=c(0, 50))
+
+# Brier score for time=50
+sbrier(smod, KM, btime=50)
+
+# a "real" model: one single survival tree with Intern. Prognostic Index
+# and mean gene expression in the first cluster as predictors
+mod <- bagging(Surv(time, cens) ~ MGEc.1 + IPI, data=DLBCL, nbagg=1)
+
+# this is a list of survfit objects (==KM-curves), one for each observation
+# in DLBCL
+pred <- predict(mod, newdata=DLBCL)
+
+# integrated Brier score up to max(time)
+sbrier(smod, pred)
+
+# Brier score at time=50
+sbrier(smod, pred, btime=50)
+# artificial examples and illustrations
+
+cleans <- function(x) { attr(x, "time") <- NULL; names(x) <- NULL; x }
+
+n <- 100
+time <- rpois(n, 20)
+cens <- rep(1, n)
+
+# checks, Graf et al. page 2536, no censoring at all!
+# no information: \pi(t) = 0.5
+
+a <- sbrier(Surv(time, cens), rep(0.5, n), time[50])
+stopifnot(all.equal(cleans(a),0.25))
+
+# some information: \pi(t) = S(t)
+
+n <- 100
+time <- 1:100
+mod <- survfit(Surv(time, cens) ~ 1)
+a <- sbrier(Surv(time, cens), rep(list(mod), n))
+mymin <- mod$surv * (1 - mod$surv)
+cleans(a)
+sum(mymin)/diff(range(time))
+
+# independent of ordering
+rand <- sample(1:100)
+b <- sbrier(Surv(time, cens)[rand], rep(list(mod), n)[rand])
+stopifnot(all.equal(cleans(a), cleans(b)))
+
+\testonly{
+ # total information: \pi(t | X) known for every obs
+
+ time <- 1:10
+ cens <- rep(1,10)
+ pred <- diag(10)
+ pred[upper.tri(pred)] <- 1
+ diag(pred) <- 0
+ # <FIXME>
+ # a <- sbrier(Surv(time, cens), pred)
+ # stopifnot(all.equal(a, 0))
+ # </FIXME>
+}
+
+# 2 groups at different risk
+
+time <- c(1:10, 21:30)
+strata <- c(rep(1, 10), rep(2, 10))
+cens <- rep(1, length(time))
+
+# no information about the groups
+
+a <- sbrier(Surv(time, cens), survfit(Surv(time, cens) ~ 1))
+b <- sbrier(Surv(time, cens), rep(list(survfit(Surv(time, cens) ~1)), 20))
+stopifnot(all.equal(a, b))
+
+# risk groups known
+
+mod <- survfit(Surv(time, cens) ~ strata)
+b <- sbrier(Surv(time, cens), c(rep(list(mod[1]), 10), rep(list(mod[2]), 10)))
+stopifnot(a > b)
+
+### GBSG2 data
+data("GBSG2", package = "TH.data")
+
+thsum <- function(x) {
+ ret <- c(median(x), quantile(x, 0.25), quantile(x,0.75))
+ names(ret)[1] <- "Median"
+ ret
+}
+
+t(apply(GBSG2[,c("age", "tsize", "pnodes",
+ "progrec", "estrec")], 2, thsum))
+
+table(GBSG2$menostat)
+table(GBSG2$tgrade)
+table(GBSG2$horTh)
+
+# pooled Kaplan-Meier
+
+mod <- survfit(Surv(time, cens) ~ 1, data=GBSG2)
+# integrated Brier score
+sbrier(Surv(GBSG2$time, GBSG2$cens), mod)
+# Brier score at 5 years
+sbrier(Surv(GBSG2$time, GBSG2$cens), mod, btime=1825)
+
+# Nottingham prognostic index
+
+GBSG2 <- GBSG2[order(GBSG2$time),]
+
+NPI <- 0.2*GBSG2$tsize/10 + 1 + as.integer(GBSG2$tgrade)
+NPI[NPI < 3.4] <- 1
+NPI[NPI >= 3.4 & NPI <=5.4] <- 2
+NPI[NPI > 5.4] <- 3
+
+mod <- survfit(Surv(time, cens) ~ NPI, data=GBSG2)
+plot(mod)
+
+pred <- c()
+survs <- c()
+for (i in sort(unique(NPI)))
+ survs <- c(survs, getsurv(mod[i], 1825))
+
+for (i in 1:nrow(GBSG2))
+ pred <- c(pred, survs[NPI[i]])
+
+# Brier score of NPI at t=5 years
+sbrier(Surv(GBSG2$time, GBSG2$cens), pred, btime=1825)
+
+
+}
+\keyword{survival}
diff --git a/man/slda.Rd b/man/slda.Rd
new file mode 100644
index 0000000..d621ccc
--- /dev/null
+++ b/man/slda.Rd
@@ -0,0 +1,96 @@
+\name{slda}
+\alias{slda}
+\alias{slda.default}
+\alias{slda.formula}
+\alias{slda.factor}
+\title{ Stabilised Linear Discriminant Analysis }
+\description{
+ Linear discriminant analysis based on left-spherically
+ distributed linear scores.
+}
+\usage{
+\method{slda}{formula}(formula, data, subset, na.action=na.rpart, \dots)
+\method{slda}{factor}(y, X, q=NULL, \dots)
+}
+\arguments{
+ \item{y}{the response variable: a factor vector of class labels.}
+ \item{X}{a data frame of predictor variables.}
+ \item{q}{the number of positive eigenvalues the scores are derived from,
+ see below.}
+ \item{formula}{a formula of the form \code{lhs ~ rhs} where \code{lhs}
+ is the response variable and \code{rhs} a set of
+ predictors.}
+ \item{data}{optional data frame containing the variables in the
+ model formula.}
+ \item{subset}{optional vector specifying a subset of observations
+ to be used.}
+ \item{na.action}{function which indicates what should happen when
+ the data contain \code{NA}s. Defaults to
+ \code{\link[rpart]{na.rpart}}.}
+ \item{...}{additional parameters passed to \code{\link[MASS]{lda}}.}
+}
+
+\details{
+ This function implements the LDA for \eqn{q}-dimensional linear scores of
+the original \eqn{p} predictors derived from the \eqn{PC_q} rule by Laeuter
+et al. (1998). Based on the product sum matrix
+\deqn{W = (X - \bar{X})^\top(X - \bar{X})}
+the eigenvalue problem \eqn{WD = diag(W)DL} is solved. The first \eqn{q}
+columns \eqn{D_q} of \eqn{D} are used as a weight matrix for the
+original \eqn{p} predictors: \eqn{XD_q}. By default, \eqn{q} is the number
+of eigenvalues greater one. The \eqn{q}-dimensional linear scores are
+left-spherically distributed and are used as predictors for a classical
+LDA.
+
+This form of reduction of the dimensionality was
+developed for discriminant analysis problems by Laeuter (1992) and was used
+for multivariate tests by Laeuter et al. (1998), Kropf (2000) gives an
+overview. For details on left-spherically distributions see Fang and
+Zhang (1990).
+
+}
+
+\value{
+ An object of class \code{slda}, a list with components
+ \item{scores}{the weight matrix.}
+ \item{mylda}{an object of class \code{lda}.}
+}
+
+\seealso{
+ \code{\link{predict.slda}}
+}
+
+\references{
+
+Fang Kai-Tai and Zhang Yao-Ting (1990), \emph{Generalized Multivariate
+Analysis}, Springer, Berlin.
+
+Siegfried Kropf (2000), \emph{Hochdimensionale multivariate Verfahren in der
+medizinischen Statistik}, Shaker Verlag, Aachen (in german).
+
+Juergen Laeuter (1992), \emph{Stabile multivariate Verfahren},
+Akademie Verlag, Berlin (in german).
+
+Juergen Laeuter, Ekkehard Glimm and Siegfried Kropf (1998), Multivariate
+Tests Based on Left-Spherically Distributed Linear Scores. \emph{The Annals
+of Statistics}, \bold{26}(5) 1972--1988.
+
+
+
+}
+
+\examples{
+
+library("mlbench")
+library("MASS")
+learn <- as.data.frame(mlbench.twonorm(100))
+test <- as.data.frame(mlbench.twonorm(1000))
+
+mlda <- lda(classes ~ ., data=learn)
+mslda <- slda(classes ~ ., data=learn)
+
+print(mean(predict(mlda, newdata=test)$class != test$classes))
+print(mean(predict(mslda, newdata=test)$class != test$classes))
+
+}
+\keyword{multivariate}
diff --git a/man/summary.bagging.Rd b/man/summary.bagging.Rd
new file mode 100644
index 0000000..75dc58f
--- /dev/null
+++ b/man/summary.bagging.Rd
@@ -0,0 +1,23 @@
+\name{summary.classbagg}
+\alias{summary.classbagg}
+\alias{summary.regbagg}
+\alias{summary.survbagg}
+\alias{print.summary.bagging}
+\title{Summarising Bagging}
+\description{
+ \code{summary} method for objects returned by \code{\link{bagging}}.
+}
+\usage{
+\method{summary}{classbagg}(object, \dots)
+}
+\arguments{
+ \item{object}{object returned by \code{\link{bagging}}.}
+ \item{\dots}{further arguments to be passed to or from methods.}
+}
+\details{
+ A representation of all trees in the object is printed.
+}
+\value{
+ none
+}
+\keyword{tree}
diff --git a/man/summary.inbagg.Rd b/man/summary.inbagg.Rd
new file mode 100644
index 0000000..a008703
--- /dev/null
+++ b/man/summary.inbagg.Rd
@@ -0,0 +1,30 @@
+\name{summary.inbagg}
+\alias{summary.inbagg}
+\alias{print.summary.inbagg}
+\title{Summarising Inbagg}
+\description{
+Summary of inbagg is returned.
+}
+\usage{
+\method{summary}{inbagg}(object, ...)
+}
+\arguments{
+ \item{object}{an object of class \code{inbagg}.}
+ \item{\dots}{additional arguments.}
+}
+\details{
+A representation of an indirect bagging model
+(the intermediates variables, the number of bootstrap samples, the trees) is printed.
+}
+\value{
+none
+}
+
+
+
+
+\seealso{\code{\link{print.summary.inbagg}}}
+
+
+\keyword{misc}
+
diff --git a/man/summary.inclass.Rd b/man/summary.inclass.Rd
new file mode 100644
index 0000000..adb48ea
--- /dev/null
+++ b/man/summary.inclass.Rd
@@ -0,0 +1,31 @@
+\name{summary.inclass}
+\alias{summary.inclass}
+\alias{print.summary.inclass}
+\title{Summarising Inclass}
+\description{
+Summary of inclass is returned.
+}
+\usage{
+\method{summary}{inclass}(object, ...)
+}
+\arguments{
+ \item{object}{an object of class \code{inclass}.}
+ \item{\dots}{additional arguments.}
+}
+\details{
+A representation of an indirect classification model
+(the intermediates variables, which modelling technique is used and the
+prediction model) is printed.
+}
+\value{
+none
+}
+
+
+
+
+\seealso{\code{\link{print.summary.inclass}}}
+
+
+\keyword{misc}
+
diff --git a/man/varset.Rd b/man/varset.Rd
new file mode 100644
index 0000000..1c65470
--- /dev/null
+++ b/man/varset.Rd
@@ -0,0 +1,58 @@
+\name{varset}
+\alias{varset}
+
+\title{Simulation Model}
+\description{
+Three sets of variables are calculated: explanatory, intermediate and response variables.
+}
+\usage{
+varset(N, sigma=0.1, theta=90, threshold=0, u=1:3)
+}
+
+\arguments{
+ \item{N}{number of simulated observations.}
+ \item{sigma}{standard deviation of the error term.}
+ \item{theta}{angle between two u vectors.}
+ \item{threshold}{cutpoint for classifying to 0 or 1.}
+ \item{u}{starting values.}
+}
+\details{
+
+For each observation values of two explanatory variables \eqn{x = (x_1, x_2)^{\top}} and of two responses \eqn{y = (y_1, y_2)^{\top}} are simulated, following the formula:
+\deqn{
+y = U*x+e = ({u_1^{\top} \atop u_2^{\top}})*x+e
+}
+where x is the evaluation of as standard normal random variable and e is generated by a normal variable with standard deviation \code{sigma}. U is a 2*2 Matrix, where
+\deqn{
+u_1 = ({u_{1, 1} \atop u_{1, 2}}),
+u_2 = ({u_{2, 1} \atop u_{2, 2}}),
+||u_1|| = ||u_2|| = 1,
+}
+i.e. a matrix of two normalised vectors.
+}
+\value{
+ A list containing the following arguments
+ \item{explanatory}{N*2 matrix of 2 explanatory variables.}
+ \item{intermediate}{N*2 matrix of 2 intermediate variables.}
+ \item{response}{response vectors with values 0 or 1.}
+}
+
+\references{
+David J. Hand, Hua Gui Li, Niall M. Adams (2001),
+Supervised classification with structured class definitions.
+\emph{Computational Statistics & Data Analysis} \bold{36},
+209--225.
+}
+
+
+\examples{
+
+theta90 <- varset(N = 1000, sigma = 0.1, theta = 90, threshold = 0)
+theta0 <- varset(N = 1000, sigma = 0.1, theta = 0, threshold = 0)
+par(mfrow = c(1, 2))
+plot(theta0$intermediate)
+plot(theta90$intermediate)
+
+}
+
+\keyword{misc}
diff --git a/src/SdiffKM.c b/src/SdiffKM.c
new file mode 100644
index 0000000..9fa2695
--- /dev/null
+++ b/src/SdiffKM.c
@@ -0,0 +1,94 @@
+/*
+
+ $Id: SdiffKM.c,v 1.2 2003/03/27 13:36:02 hothorn Exp $
+
+ SdiffKM: integrated squared difference between survival curve
+ and KM estimator
+
+*/
+
+#include <R.h>
+#include <Rinternals.h>
+#include <Rmath.h>
+
+SEXP SdiffKM(SEXP time, SEXP prob, SEXP args)
+{
+ SEXP rint;
+ double d, p, helpone, helptwo, k;
+ double myint = 0.0;
+ double hazard, window, tw;
+ int i, j, n;
+
+ /*
+ check arguments
+ */
+
+ if (!isVector(time))
+ error("Argument time is not a vector");
+ n = LENGTH(time);
+ if (REAL(time)[0] != 0.0)
+ error("time[1] must be zero");
+
+ if (!isVector(prob))
+ error("Argument prob is not a vector");
+ if (REAL(prob)[0] > 1.0)
+ error("prob[1] must be less or equal 1");
+ if (REAL(prob)[0] < 0.0)
+ error("prob[1] must be greater or equal 0");
+ if (LENGTH(prob) != n)
+ error("prob and time differ in their length");
+
+ if (!isVector(args) || LENGTH(args) != 2)
+ error("Argument args must be vector with two elements");
+
+ hazard = REAL(args)[0];
+ window = REAL(args)[1];
+
+ /*
+ prepare for return values
+ */
+
+ PROTECT(rint = allocVector(REALSXP, 1));
+ UNPROTECT(1);
+ REAL(rint)[0] = 0.0;
+
+ /*
+ for all discrete times
+ */
+
+ for (i = 0; i < n-1; i++) {
+
+ /* get difference between times */
+ d = REAL(time)[i+1] - REAL(time)[i];
+
+ /* estimated survival probability at this time */
+ p = REAL(prob)[i];
+
+ /* if the difference is small enough */
+ if (d < window) {
+ helpone = p - exp(-REAL(time)[i] * hazard);
+ helptwo = p - exp(-REAL(time)[i+1] * hazard);
+
+ /* mean of over and under sum */
+ myint += 0.5 * d * (helpone*helpone + helptwo*helptwo);
+
+ } else {
+
+ /* split up in smaller pieces */
+ k = ftrunc(d/window) + 1;
+ tw = d/k;
+ for (j = 0; j < k; j++) {
+ helpone = p - exp(-(REAL(time)[i] + j*tw)*hazard);
+ helptwo = p - exp(-(REAL(time)[i] + (j+1)*tw)*hazard);
+
+ /* mean of over and under sum for all small windows */
+ myint += 0.5*tw*(helpone*helpone + helptwo*helptwo);
+ }
+ }
+ }
+
+ /* ok, get outa here */
+
+ REAL(rint)[0] = myint;
+ return(rint);
+}
diff --git a/src/init.c b/src/init.c
new file mode 100644
index 0000000..ab603d4
--- /dev/null
+++ b/src/init.c
@@ -0,0 +1,23 @@
+
+#include <R.h>
+#include <Rinternals.h>
+#include <stdlib.h> // for NULL
+#include <R_ext/Rdynload.h>
+
+/* FIXME:
+ Check these declarations against the C/Fortran source code.
+*/
+
+/* .Call calls */
+extern SEXP SdiffKM(SEXP, SEXP, SEXP);
+
+static const R_CallMethodDef CallEntries[] = {
+ {"SdiffKM", (DL_FUNC) &SdiffKM, 3},
+ {NULL, NULL, 0}
+};
+
+void R_init_ipred(DllInfo *dll)
+{
+ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+ R_useDynamicSymbols(dll, FALSE);
+}
diff --git a/tests/Examples/ipred-Ex.Rout.save b/tests/Examples/ipred-Ex.Rout.save
new file mode 100644
index 0000000..74cfa8f
--- /dev/null
+++ b/tests/Examples/ipred-Ex.Rout.save
@@ -0,0 +1,1188 @@
+
+R version 3.3.2 (2016-10-31) -- "Sincere Pumpkin Patch"
+Copyright (C) 2016 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+ Natural language support but running in an English locale
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> pkgname <- "ipred"
+> source(file.path(R.home("share"), "R", "examples-header.R"))
+> options(warn = 1)
+> library('ipred')
+>
+> base::assign(".oldSearch", base::search(), pos = 'CheckExEnv')
+> cleanEx()
+> nameEx("DLBCL")
+> ### * DLBCL
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: DLBCL
+> ### Title: Diffuse Large B-Cell Lymphoma
+> ### Aliases: DLBCL
+> ### Keywords: datasets
+>
+> ### ** Examples
+>
+>
+> set.seed(290875)
+>
+> data("DLBCL", package="ipred")
+> library("survival")
+> survfit(Surv(time, cens) ~ 1, data=DLBCL)
+Call: survfit(formula = Surv(time, cens) ~ 1, data = DLBCL)
+
+ n events median 0.95LCL 0.95UCL
+ 40.0 22.0 36.0 15.5 NA
+>
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:survival’
+
+> nameEx("GlaucomaMVF")
+> ### * GlaucomaMVF
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: GlaucomaMVF
+> ### Title: Glaucoma Database
+> ### Aliases: GlaucomaMVF
+> ### Keywords: datasets
+>
+> ### ** Examples
+>
+> ## Not run:
+> ##D
+> ##D data("GlaucomaMVF", package = "ipred")
+> ##D library("rpart")
+> ##D
+> ##D response <- function (data) {
+> ##D attach(data)
+> ##D res <- ifelse((!is.na(clv) & !is.na(lora) & clv >= 5.1 & lora >=
+> ##D 49.23372) | (!is.na(clv) & !is.na(lora) & !is.na(cs) &
+> ##D clv < 5.1 & lora >= 58.55409 & cs < 1.405) | (is.na(clv) &
+> ##D !is.na(lora) & !is.na(cs) & lora >= 58.55409 & cs < 1.405) |
+> ##D (!is.na(clv) & is.na(lora) & cs < 1.405), 0, 1)
+> ##D detach(data)
+> ##D factor (res, labels = c("glaucoma", "normal"))
+> ##D }
+> ##D
+> ##D errorest(Class~clv+lora+cs~., data = GlaucomaMVF, model=inclass,
+> ##D estimator="cv", pFUN = list(list(model = rpart)), cFUN = response)
+> ## End(Not run)
+>
+>
+>
+> cleanEx()
+> nameEx("bagging")
+> ### * bagging
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: bagging
+> ### Title: Bagging Classification, Regression and Survival Trees
+> ### Aliases: bagging ipredbagg ipredbagg.factor ipredbagg.integer
+> ### ipredbagg.numeric ipredbagg.Surv ipredbagg.default bagging.data.frame
+> ### bagging.default
+> ### Keywords: tree
+>
+> ### ** Examples
+>
+>
+> library("MASS")
+> library("survival")
+>
+> # Classification: Breast Cancer data
+>
+> data("BreastCancer", package = "mlbench")
+>
+> # Test set error bagging (nbagg = 50): 3.7% (Breiman, 1998, Table 5)
+>
+> mod <- bagging(Class ~ Cl.thickness + Cell.size
++ + Cell.shape + Marg.adhesion
++ + Epith.c.size + Bare.nuclei
++ + Bl.cromatin + Normal.nucleoli
++ + Mitoses, data=BreastCancer, coob=TRUE)
+> print(mod)
+
+Bagging classification trees with 25 bootstrap replications
+
+Call: bagging.data.frame(formula = Class ~ Cl.thickness + Cell.size +
+ Cell.shape + Marg.adhesion + Epith.c.size + Bare.nuclei +
+ Bl.cromatin + Normal.nucleoli + Mitoses, data = BreastCancer,
+ coob = TRUE)
+
+Out-of-bag estimate of misclassification error: 0.0381
+
+>
+> # Test set error bagging (nbagg=50): 7.9% (Breiman, 1996a, Table 2)
+> data("Ionosphere", package = "mlbench")
+> Ionosphere$V2 <- NULL # constant within groups
+>
+> bagging(Class ~ ., data=Ionosphere, coob=TRUE)
+
+Bagging classification trees with 25 bootstrap replications
+
+Call: bagging.data.frame(formula = Class ~ ., data = Ionosphere, coob = TRUE)
+
+Out-of-bag estimate of misclassification error: 0.0912
+
+>
+> # Double-Bagging: combine LDA and classification trees
+>
+> # predict returns the linear discriminant values, i.e. linear combinations
+> # of the original predictors
+>
+> comb.lda <- list(list(model=lda, predict=function(obj, newdata)
++ predict(obj, newdata)$x))
+>
+> # Note: out-of-bag estimator is not available in this situation, use
+> # errorest
+>
+> mod <- bagging(Class ~ ., data=Ionosphere, comb=comb.lda)
+>
+> predict(mod, Ionosphere[1:10,])
+ [1] good bad good bad good bad good bad good bad
+Levels: bad good
+>
+> # Regression:
+>
+> data("BostonHousing", package = "mlbench")
+>
+> # Test set error (nbagg=25, trees pruned): 3.41 (Breiman, 1996a, Table 8)
+>
+> mod <- bagging(medv ~ ., data=BostonHousing, coob=TRUE)
+> print(mod)
+
+Bagging regression trees with 25 bootstrap replications
+
+Call: bagging.data.frame(formula = medv ~ ., data = BostonHousing,
+ coob = TRUE)
+
+Out-of-bag estimate of root mean squared error: 4.0618
+
+>
+> library("mlbench")
+> learn <- as.data.frame(mlbench.friedman1(200))
+>
+> # Test set error (nbagg=25, trees pruned): 2.47 (Breiman, 1996a, Table 8)
+>
+> mod <- bagging(y ~ ., data=learn, coob=TRUE)
+> print(mod)
+
+Bagging regression trees with 25 bootstrap replications
+
+Call: bagging.data.frame(formula = y ~ ., data = learn, coob = TRUE)
+
+Out-of-bag estimate of root mean squared error: 2.8532
+
+>
+> # Survival data
+>
+> # Brier score for censored data estimated by
+> # 10 times 10-fold cross-validation: 0.2 (Hothorn et al,
+> # 2002)
+>
+> data("DLBCL", package = "ipred")
+> mod <- bagging(Surv(time,cens) ~ MGEc.1 + MGEc.2 + MGEc.3 + MGEc.4 + MGEc.5 +
++ MGEc.6 + MGEc.7 + MGEc.8 + MGEc.9 +
++ MGEc.10 + IPI, data=DLBCL, coob=TRUE)
+>
+> print(mod)
+
+Bagging survival trees with 25 bootstrap replications
+
+Call: bagging.data.frame(formula = Surv(time, cens) ~ MGEc.1 + MGEc.2 +
+ MGEc.3 + MGEc.4 + MGEc.5 + MGEc.6 + MGEc.7 + MGEc.8 + MGEc.9 +
+ MGEc.10 + IPI, data = DLBCL, coob = TRUE)
+
+Out-of-bag estimate of Brier's score: 0.2098
+
+>
+>
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:mlbench’, ‘package:survival’, ‘package:MASS’
+
+> nameEx("dystrophy")
+> ### * dystrophy
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: dystrophy
+> ### Title: Detection of muscular dystrophy carriers.
+> ### Aliases: dystrophy
+> ### Keywords: datasets
+>
+> ### ** Examples
+>
+> ## Not run:
+> ##D
+> ##D data("dystrophy")
+> ##D library("rpart")
+> ##D errorest(Class~CK+H~AGE+PK+LD, data = dystrophy, model = inbagg,
+> ##D pFUN = list(list(model = lm, predict = mypredict.lm), list(model = rpart)),
+> ##D ns = 0.75, estimator = "cv")
+> ## End(Not run)
+>
+>
+>
+> cleanEx()
+> nameEx("errorest")
+> ### * errorest
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: errorest
+> ### Title: Estimators of Prediction Error
+> ### Aliases: errorest errorest.data.frame errorest.default
+> ### Keywords: misc
+>
+> ### ** Examples
+>
+>
+> # Classification
+>
+> data("iris")
+> library("MASS")
+>
+> # force predict to return class labels only
+> mypredict.lda <- function(object, newdata)
++ predict(object, newdata = newdata)$class
+>
+> # 10-fold cv of LDA for Iris data
+> errorest(Species ~ ., data=iris, model=lda,
++ estimator = "cv", predict= mypredict.lda)
+
+Call:
+errorest.data.frame(formula = Species ~ ., data = iris, model = lda,
+ predict = mypredict.lda, estimator = "cv")
+
+ 10-fold cross-validation estimator of misclassification error
+
+Misclassification error: 0.02
+
+>
+> data("PimaIndiansDiabetes", package = "mlbench")
+> ## Not run:
+> ##D # 632+ bootstrap of LDA for Diabetes data
+> ##D errorest(diabetes ~ ., data=PimaIndiansDiabetes, model=lda,
+> ##D estimator = "632plus", predict= mypredict.lda)
+> ## End(Not run)
+>
+> #cv of a fixed partition of the data
+> list.tindx <- list(1:100, 101:200, 201:300, 301:400, 401:500,
++ 501:600, 601:700, 701:768)
+>
+> errorest(diabetes ~ ., data=PimaIndiansDiabetes, model=lda,
++ estimator = "cv", predict = mypredict.lda,
++ est.para = control.errorest(list.tindx = list.tindx))
+
+Call:
+errorest.data.frame(formula = diabetes ~ ., data = PimaIndiansDiabetes,
+ model = lda, predict = mypredict.lda, estimator = "cv", est.para = control.errorest(list.tindx = list.tindx))
+
+ 8-fold cross-validation estimator of misclassification error
+
+Misclassification error: 0.2227
+
+>
+> ## Not run:
+> ##D #both bootstrap estimations based on fixed partitions
+> ##D
+> ##D list.tindx <- vector(mode = "list", length = 25)
+> ##D for(i in 1:25) {
+> ##D list.tindx[[i]] <- sample(1:768, 768, TRUE)
+> ##D }
+> ##D
+> ##D errorest(diabetes ~ ., data=PimaIndiansDiabetes, model=lda,
+> ##D estimator = c("boot", "632plus"), predict= mypredict.lda,
+> ##D est.para = control.errorest(list.tindx = list.tindx))
+> ##D
+> ## End(Not run)
+> data("Glass", package = "mlbench")
+>
+> # LDA has cross-validated misclassification error of
+> # 38% (Ripley, 1996, page 98)
+>
+> # Pruned trees about 32% (Ripley, 1996, page 230)
+>
+> # use stratified sampling here, i.e. preserve the class proportions
+> errorest(Type ~ ., data=Glass, model=lda,
++ predict=mypredict.lda, est.para=control.errorest(strat=TRUE))
+
+Call:
+errorest.data.frame(formula = Type ~ ., data = Glass, model = lda,
+ predict = mypredict.lda, est.para = control.errorest(strat = TRUE))
+
+ 10-fold cross-validation estimator of misclassification error
+
+Misclassification error: 0.3785
+
+>
+> # force predict to return class labels
+> mypredict.rpart <- function(object, newdata)
++ predict(object, newdata = newdata,type="class")
+>
+> library("rpart")
+> pruneit <- function(formula, ...)
++ prune(rpart(formula, ...), cp =0.01)
+>
+> errorest(Type ~ ., data=Glass, model=pruneit,
++ predict=mypredict.rpart, est.para=control.errorest(strat=TRUE))
+
+Call:
+errorest.data.frame(formula = Type ~ ., data = Glass, model = pruneit,
+ predict = mypredict.rpart, est.para = control.errorest(strat = TRUE))
+
+ 10-fold cross-validation estimator of misclassification error
+
+Misclassification error: 0.3178
+
+>
+> # compute sensitivity and specifity for stabilised LDA
+>
+> data("GlaucomaM", package = "TH.data")
+>
+> error <- errorest(Class ~ ., data=GlaucomaM, model=slda,
++ predict=mypredict.lda, est.para=control.errorest(predictions=TRUE))
+>
+> # sensitivity
+>
+> mean(error$predictions[GlaucomaM$Class == "glaucoma"] == "glaucoma")
+[1] 0.8163265
+>
+> # specifity
+>
+> mean(error$predictions[GlaucomaM$Class == "normal"] == "normal")
+[1] 0.8367347
+>
+> # Indirect Classification: Smoking data
+>
+> data(Smoking)
+> # Set three groups of variables:
+> # 1) explanatory variables are: TarY, NicY, COY, Sex, Age
+> # 2) intermediate variables are: TVPS, BPNL, COHB
+> # 3) response (resp) is defined by:
+>
+> resp <- function(data){
++ data <- data[, c("TVPS", "BPNL", "COHB")]
++ res <- t(t(data) > c(4438, 232.5, 58))
++ res <- as.factor(ifelse(apply(res, 1, sum) > 2, 1, 0))
++ res
++ }
+>
+> response <- resp(Smoking[ ,c("TVPS", "BPNL", "COHB")])
+> smoking <- cbind(Smoking, response)
+>
+> formula <- response~TVPS+BPNL+COHB~TarY+NicY+COY+Sex+Age
+>
+> # Estimation per leave-one-out estimate for the misclassification is
+> # 36.36% (Hand et al., 2001), using indirect classification with
+> # linear models
+> ## Not run:
+> ##D errorest(formula, data = smoking, model = inclass,estimator = "cv",
+> ##D pFUN = list(list(model=lm, predict = mypredict.lm)), cFUN = resp,
+> ##D est.para=control.errorest(k=nrow(smoking)))
+> ## End(Not run)
+>
+> # Regression
+>
+> data("BostonHousing", package = "mlbench")
+>
+> # 10-fold cv of lm for Boston Housing data
+> errorest(medv ~ ., data=BostonHousing, model=lm,
++ est.para=control.errorest(random=FALSE))
+
+Call:
+errorest.data.frame(formula = medv ~ ., data = BostonHousing,
+ model = lm, est.para = control.errorest(random = FALSE))
+
+ 10-fold cross-validation estimator of root mean squared error
+
+Root mean squared error: 5.877
+
+>
+> # the same, with "model" returning a function for prediction
+> # instead of an object of class "lm"
+>
+> mylm <- function(formula, data) {
++ mod <- lm(formula, data)
++ function(newdata) predict(mod, newdata)
++ }
+>
+> errorest(medv ~ ., data=BostonHousing, model=mylm,
++ est.para=control.errorest(random=FALSE))
+
+Call:
+errorest.data.frame(formula = medv ~ ., data = BostonHousing,
+ model = mylm, est.para = control.errorest(random = FALSE))
+
+ 10-fold cross-validation estimator of root mean squared error
+
+Root mean squared error: 5.877
+
+>
+>
+> # Survival data
+>
+> data("GBSG2", package = "TH.data")
+> library("survival")
+>
+> # prediction is fitted Kaplan-Meier
+> predict.survfit <- function(object, newdata) object
+>
+> # 5-fold cv of Kaplan-Meier for GBSG2 study
+> errorest(Surv(time, cens) ~ 1, data=GBSG2, model=survfit,
++ predict=predict.survfit, est.para=control.errorest(k=5))
+
+Call:
+errorest.data.frame(formula = Surv(time, cens) ~ 1, data = GBSG2,
+ model = survfit, predict = predict.survfit, est.para = control.errorest(k = 5))
+
+ 5-fold cross-validation estimator of Brier's score
+
+Brier's score: 0.1927
+
+>
+>
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:survival’, ‘package:rpart’, ‘package:MASS’
+
+> nameEx("inbagg")
+> ### * inbagg
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: inbagg
+> ### Title: Indirect Bagging
+> ### Aliases: inbagg inbagg.default inbagg.data.frame
+> ### Keywords: misc
+>
+> ### ** Examples
+>
+>
+> library("MASS")
+> library("rpart")
+> y <- as.factor(sample(1:2, 100, replace = TRUE))
+> W <- mvrnorm(n = 200, mu = rep(0, 3), Sigma = diag(3))
+> X <- mvrnorm(n = 200, mu = rep(2, 3), Sigma = diag(3))
+> colnames(W) <- c("w1", "w2", "w3")
+> colnames(X) <- c("x1", "x2", "x3")
+> DATA <- data.frame(y, W, X)
+>
+>
+> pFUN <- list(list(formula = w1~x1+x2, model = lm, predict = mypredict.lm),
++ list(model = rpart))
+>
+> inbagg(y~w1+w2+w3~x1+x2+x3, data = DATA, pFUN = pFUN)
+
+ Indirect bagging, with 25 bootstrap samples and intermediate variables:
+ w1 w2 w3
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:rpart’, ‘package:MASS’
+
+> nameEx("inclass")
+> ### * inclass
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: inclass
+> ### Title: Indirect Classification
+> ### Aliases: inclass inclass.default inclass.data.frame
+> ### Keywords: misc
+>
+> ### ** Examples
+>
+> data("Smoking", package = "ipred")
+> # Set three groups of variables:
+> # 1) explanatory variables are: TarY, NicY, COY, Sex, Age
+> # 2) intermediate variables are: TVPS, BPNL, COHB
+> # 3) response (resp) is defined by:
+>
+> classify <- function(data){
++ data <- data[,c("TVPS", "BPNL", "COHB")]
++ res <- t(t(data) > c(4438, 232.5, 58))
++ res <- as.factor(ifelse(apply(res, 1, sum) > 2, 1, 0))
++ res
++ }
+>
+> response <- classify(Smoking[ ,c("TVPS", "BPNL", "COHB")])
+> smoking <- data.frame(Smoking, response)
+>
+> formula <- response~TVPS+BPNL+COHB~TarY+NicY+COY+Sex+Age
+>
+> inclass(formula, data = smoking, pFUN = list(list(model = lm, predict =
++ mypredict.lm)), cFUN = classify)
+
+ Indirect classification, with 3 intermediate variables:
+ TVPS BPNL COHB
+
+ Predictive model per intermediate is lm
+>
+>
+>
+>
+> cleanEx()
+> nameEx("ipredknn")
+> ### * ipredknn
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: ipredknn
+> ### Title: k-Nearest Neighbour Classification
+> ### Aliases: ipredknn
+> ### Keywords: multivariate
+>
+> ### ** Examples
+>
+>
+> library("mlbench")
+> learn <- as.data.frame(mlbench.twonorm(300))
+>
+> mypredict.knn <- function(object, newdata)
++ predict.ipredknn(object, newdata, type="class")
+>
+> errorest(classes ~., data=learn, model=ipredknn,
++ predict=mypredict.knn)
+
+Call:
+errorest.data.frame(formula = classes ~ ., data = learn, model = ipredknn,
+ predict = mypredict.knn)
+
+ 10-fold cross-validation estimator of misclassification error
+
+Misclassification error: 0.0533
+
+>
+>
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:mlbench’
+
+> nameEx("kfoldcv")
+> ### * kfoldcv
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: kfoldcv
+> ### Title: Subsamples for k-fold Cross-Validation
+> ### Aliases: kfoldcv
+> ### Keywords: misc
+>
+> ### ** Examples
+>
+>
+> # 10-fold CV with N = 91
+>
+> kfoldcv(10, 91)
+ [1] 10 9 9 9 9 9 9 9 9 9
+>
+> ## Don't show:
+> k <- sample(5:15, 1)
+> k
+[1] 7
+> N <- sample(50:150, 1)
+> N
+[1] 87
+> stopifnot(sum(kfoldcv(k, N)) == N)
+> ## End(Don't show)
+>
+>
+>
+>
+> cleanEx()
+> nameEx("predict.bagging")
+> ### * predict.bagging
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: predict.classbagg
+> ### Title: Predictions from Bagging Trees
+> ### Aliases: predict.classbagg predict.regbagg predict.survbagg
+> ### Keywords: tree
+>
+> ### ** Examples
+>
+>
+> data("Ionosphere", package = "mlbench")
+> Ionosphere$V2 <- NULL # constant within groups
+>
+> # nbagg = 10 for performance reasons here
+> mod <- bagging(Class ~ ., data=Ionosphere)
+>
+> # out-of-bag estimate
+>
+> mean(predict(mod) != Ionosphere$Class)
+[1] 0.07977208
+>
+> # predictions for the first 10 observations
+>
+> predict(mod, newdata=Ionosphere[1:10,])
+ [1] good bad good bad good bad good bad good bad
+Levels: bad good
+>
+> predict(mod, newdata=Ionosphere[1:10,], type="prob")
+ bad good
+ [1,] 0.00 1.00
+ [2,] 1.00 0.00
+ [3,] 0.00 1.00
+ [4,] 0.64 0.36
+ [5,] 0.00 1.00
+ [6,] 1.00 0.00
+ [7,] 0.00 1.00
+ [8,] 1.00 0.00
+ [9,] 0.00 1.00
+[10,] 1.00 0.00
+>
+>
+>
+>
+> cleanEx()
+> nameEx("predict.inbagg")
+> ### * predict.inbagg
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: predict.inbagg
+> ### Title: Predictions from an Inbagg Object
+> ### Aliases: predict.inbagg
+> ### Keywords: misc
+>
+> ### ** Examples
+>
+>
+> library("MASS")
+> library("rpart")
+> y <- as.factor(sample(1:2, 100, replace = TRUE))
+> W <- mvrnorm(n = 200, mu = rep(0, 3), Sigma = diag(3))
+> X <- mvrnorm(n = 200, mu = rep(2, 3), Sigma = diag(3))
+> colnames(W) <- c("w1", "w2", "w3")
+> colnames(X) <- c("x1", "x2", "x3")
+> DATA <- data.frame(y, W, X)
+>
+> pFUN <- list(list(formula = w1~x1+x2, model = lm),
++ list(model = rpart))
+>
+> RES <- inbagg(y~w1+w2+w3~x1+x2+x3, data = DATA, pFUN = pFUN)
+> predict(RES, newdata = X)
+ [1] 1 1 2 2 1 2 2 2 2 1 1 1 2 1 2 1 2 2 1 2 2 1 2 1 1 1 1 1 2 1 1 2 1 1 2 2 2
+ [38] 1 2 1 2 2 2 2 2 2 1 1 2 2 1 2 1 1 1 1 1 2 2 1 2 1 1 1 2 1 1 2 1 2 1 2 1 1
+ [75] 1 2 2 1 2 2 1 2 1 1 2 1 2 1 1 1 1 1 2 2 2 2 1 1 2 2 1 1 2 2 1 2 2 2 2 1 1
+[112] 1 2 1 2 1 2 2 1 2 2 1 2 1 1 1 1 1 2 1 1 2 1 1 2 2 2 1 2 1 2 2 2 2 2 2 1 1
+[149] 2 2 1 2 1 1 1 1 1 2 2 1 2 1 1 1 2 1 1 2 1 2 1 2 1 1 1 2 2 1 2 2 1 2 1 1 2
+[186] 1 2 1 1 1 1 1 2 2 2 2 1 1 2 2
+Levels: 1 2
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:rpart’, ‘package:MASS’
+
+> nameEx("predict.inclass")
+> ### * predict.inclass
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: predict.inclass
+> ### Title: Predictions from an Inclass Object
+> ### Aliases: predict.inclass
+> ### Keywords: misc
+>
+> ### ** Examples
+>
+> ## Not run:
+> ##D # Simulation model, classification rule following Hand et al. (2001)
+> ##D
+> ##D theta90 <- varset(N = 1000, sigma = 0.1, theta = 90, threshold = 0)
+> ##D
+> ##D dataset <- as.data.frame(cbind(theta90$explanatory, theta90$intermediate))
+> ##D names(dataset) <- c(colnames(theta90$explanatory),
+> ##D colnames(theta90$intermediate))
+> ##D
+> ##D classify <- function(Y, threshold = 0) {
+> ##D Y <- Y[,c("y1", "y2")]
+> ##D z <- (Y > threshold)
+> ##D resp <- as.factor(ifelse((z[,1] + z[,2]) > 1, 1, 0))
+> ##D return(resp)
+> ##D }
+> ##D
+> ##D formula <- response~y1+y2~x1+x2
+> ##D
+> ##D fit <- inclass(formula, data = dataset, pFUN = list(list(model = lm)),
+> ##D cFUN = classify)
+> ##D
+> ##D predict(object = fit, newdata = dataset)
+> ##D
+> ##D
+> ##D data("Smoking", package = "ipred")
+> ##D
+> ##D # explanatory variables are: TarY, NicY, COY, Sex, Age
+> ##D # intermediate variables are: TVPS, BPNL, COHB
+> ##D # reponse is defined by:
+> ##D
+> ##D classify <- function(data){
+> ##D data <- data[,c("TVPS", "BPNL", "COHB")]
+> ##D res <- t(t(data) > c(4438, 232.5, 58))
+> ##D res <- as.factor(ifelse(apply(res, 1, sum) > 2, 1, 0))
+> ##D res
+> ##D }
+> ##D
+> ##D response <- classify(Smoking[ ,c("TVPS", "BPNL", "COHB")])
+> ##D smoking <- cbind(Smoking, response)
+> ##D
+> ##D formula <- response~TVPS+BPNL+COHB~TarY+NicY+COY+Sex+Age
+> ##D
+> ##D fit <- inclass(formula, data = smoking,
+> ##D pFUN = list(list(model = lm)), cFUN = classify)
+> ##D
+> ##D
+> ##D predict(object = fit, newdata = smoking)
+> ## End(Not run)
+>
+> data("GlaucomaMVF", package = "ipred")
+> library("rpart")
+> glaucoma <- GlaucomaMVF[,(names(GlaucomaMVF) != "tension")]
+> # explanatory variables are derived by laser scanning image and intra occular pressure
+> # intermediate variables are: clv, cs, lora
+> # response is defined by
+>
+> classify <- function (data) {
++ attach(data)
++ res <- ifelse((!is.na(clv) & !is.na(lora) & clv >= 5.1 & lora >=
++ 49.23372) | (!is.na(clv) & !is.na(lora) & !is.na(cs) &
++ clv < 5.1 & lora >= 58.55409 & cs < 1.405) | (is.na(clv) &
++ !is.na(lora) & !is.na(cs) & lora >= 58.55409 & cs < 1.405) |
++ (!is.na(clv) & is.na(lora) & cs < 1.405), 0, 1)
++ detach(data)
++ factor (res, labels = c("glaucoma", "normal"))
++ }
+>
+> fit <- inclass(Class~clv+lora+cs~., data = glaucoma,
++ pFUN = list(list(model = rpart)), cFUN = classify)
+>
+> data("GlaucomaM", package = "TH.data")
+> predict(object = fit, newdata = GlaucomaM)
+ [1] normal normal normal normal glaucoma glaucoma normal normal
+ [9] normal normal normal normal glaucoma normal normal normal
+ [17] normal normal normal glaucoma normal normal glaucoma normal
+ [25] normal normal glaucoma normal glaucoma normal normal normal
+ [33] normal normal normal normal glaucoma normal normal normal
+ [41] normal normal glaucoma normal normal glaucoma normal normal
+ [49] normal normal normal glaucoma glaucoma glaucoma glaucoma normal
+ [57] glaucoma glaucoma normal normal glaucoma normal glaucoma normal
+ [65] glaucoma normal normal normal normal normal glaucoma glaucoma
+ [73] glaucoma normal normal normal glaucoma normal normal normal
+ [81] glaucoma normal normal normal normal glaucoma glaucoma glaucoma
+ [89] glaucoma normal normal normal glaucoma normal normal normal
+ [97] normal normal glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma
+[105] normal glaucoma normal glaucoma glaucoma glaucoma glaucoma glaucoma
+[113] glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma
+[121] glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma
+[129] glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma
+[137] glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma
+[145] glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma
+[153] glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma
+[161] glaucoma glaucoma glaucoma normal glaucoma glaucoma glaucoma glaucoma
+[169] glaucoma glaucoma glaucoma glaucoma normal glaucoma glaucoma glaucoma
+[177] glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma glaucoma
+[185] glaucoma glaucoma normal glaucoma glaucoma glaucoma glaucoma glaucoma
+[193] glaucoma glaucoma glaucoma glaucoma
+Levels: glaucoma normal
+>
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:rpart’
+
+> nameEx("prune.bagging")
+> ### * prune.bagging
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: prune.classbagg
+> ### Title: Pruning for Bagging
+> ### Aliases: prune.classbagg prune.regbagg prune.survbagg
+> ### Keywords: tree
+>
+> ### ** Examples
+>
+>
+> data("Glass", package = "mlbench")
+> library("rpart")
+>
+> mod <- bagging(Type ~ ., data=Glass, nbagg=10, coob=TRUE)
+> pmod <- prune(mod)
+> print(pmod)
+
+Bagging classification trees with 10 bootstrap replications
+
+Call: bagging.data.frame(formula = Type ~ ., data = Glass, nbagg = 10,
+ coob = TRUE)
+
+Out-of-bag estimate of misclassification error: 0.285
+
+>
+>
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:rpart’
+
+> nameEx("rsurv")
+> ### * rsurv
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: rsurv
+> ### Title: Simulate Survival Data
+> ### Aliases: rsurv
+> ### Keywords: survival
+>
+> ### ** Examples
+>
+>
+> library("survival")
+> # 3*X1 + X2
+> simdat <- rsurv(500, model="C")
+> coxph(Surv(time, cens) ~ ., data=simdat)
+Call:
+coxph(formula = Surv(time, cens) ~ ., data = simdat)
+
+ coef exp(coef) se(coef) z p
+X1 3.1555 23.4648 0.2023 15.60 < 2e-16
+X2 1.1015 3.0086 0.1628 6.77 1.3e-11
+X3 -0.2103 0.8104 0.1525 -1.38 0.168
+X4 0.0466 1.0477 0.1488 0.31 0.754
+X5 0.2709 1.3111 0.1536 1.76 0.078
+
+Likelihood ratio test=289 on 5 df, p=0
+n= 500, number of events= 500
+>
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:survival’
+
+> nameEx("sbrier")
+> ### * sbrier
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: sbrier
+> ### Title: Model Fit for Survival Data
+> ### Aliases: sbrier
+> ### Keywords: survival
+>
+> ### ** Examples
+>
+>
+> library("survival")
+> data("DLBCL", package = "ipred")
+> smod <- Surv(DLBCL$time, DLBCL$cens)
+>
+> KM <- survfit(smod ~ 1)
+> # integrated Brier score up to max(DLBCL$time)
+> sbrier(smod, KM)
+ [,1]
+[1,] 0.2237226
+attr(,"names")
+[1] "integrated Brier score"
+attr(,"time")
+[1] 1.3 129.9
+>
+> # integrated Brier score up to time=50
+> sbrier(smod, KM, btime=c(0, 50))
+Warning in sbrier(smod, KM, btime = c(0, 50)) :
+ btime[1] is smaller than min(time)
+ [,1]
+[1,] 0.2174081
+attr(,"names")
+[1] "integrated Brier score"
+attr(,"time")
+[1] 1.3 39.6
+>
+> # Brier score for time=50
+> sbrier(smod, KM, btime=50)
+Brier score
+ 0.249375
+attr(,"time")
+[1] 50
+>
+> # a "real" model: one single survival tree with Intern. Prognostic Index
+> # and mean gene expression in the first cluster as predictors
+> mod <- bagging(Surv(time, cens) ~ MGEc.1 + IPI, data=DLBCL, nbagg=1)
+>
+> # this is a list of survfit objects (==KM-curves), one for each observation
+> # in DLBCL
+> pred <- predict(mod, newdata=DLBCL)
+>
+> # integrated Brier score up to max(time)
+> sbrier(smod, pred)
+ [,1]
+[1,] 0.1442559
+attr(,"names")
+[1] "integrated Brier score"
+attr(,"time")
+[1] 1.3 129.9
+>
+> # Brier score at time=50
+> sbrier(smod, pred, btime=50)
+Brier score
+ 0.1774478
+attr(,"time")
+[1] 50
+> # artificial examples and illustrations
+>
+> cleans <- function(x) { attr(x, "time") <- NULL; names(x) <- NULL; x }
+>
+> n <- 100
+> time <- rpois(n, 20)
+> cens <- rep(1, n)
+>
+> # checks, Graf et al. page 2536, no censoring at all!
+> # no information: \pi(t) = 0.5
+>
+> a <- sbrier(Surv(time, cens), rep(0.5, n), time[50])
+> stopifnot(all.equal(cleans(a),0.25))
+>
+> # some information: \pi(t) = S(t)
+>
+> n <- 100
+> time <- 1:100
+> mod <- survfit(Surv(time, cens) ~ 1)
+> a <- sbrier(Surv(time, cens), rep(list(mod), n))
+> mymin <- mod$surv * (1 - mod$surv)
+> cleans(a)
+ [,1]
+[1,] 0.1682833
+> sum(mymin)/diff(range(time))
+[1] 0.1683333
+>
+> # independent of ordering
+> rand <- sample(1:100)
+> b <- sbrier(Surv(time, cens)[rand], rep(list(mod), n)[rand])
+> stopifnot(all.equal(cleans(a), cleans(b)))
+>
+> ## Don't show:
+> # total information: \pi(t | X) known for every obs
+>
+> time <- 1:10
+> cens <- rep(1,10)
+> pred <- diag(10)
+> pred[upper.tri(pred)] <- 1
+> diag(pred) <- 0
+> # <FIXME>
+> # a <- sbrier(Surv(time, cens), pred)
+> # stopifnot(all.equal(a, 0))
+> # </FIXME>
+> ## End(Don't show)
+>
+> # 2 groups at different risk
+>
+> time <- c(1:10, 21:30)
+> strata <- c(rep(1, 10), rep(2, 10))
+> cens <- rep(1, length(time))
+>
+> # no information about the groups
+>
+> a <- sbrier(Surv(time, cens), survfit(Surv(time, cens) ~ 1))
+> b <- sbrier(Surv(time, cens), rep(list(survfit(Surv(time, cens) ~1)), 20))
+> stopifnot(all.equal(a, b))
+>
+> # risk groups known
+>
+> mod <- survfit(Surv(time, cens) ~ strata)
+> b <- sbrier(Surv(time, cens), c(rep(list(mod[1]), 10), rep(list(mod[2]), 10)))
+> stopifnot(a > b)
+>
+> ### GBSG2 data
+> data("GBSG2", package = "TH.data")
+>
+> thsum <- function(x) {
++ ret <- c(median(x), quantile(x, 0.25), quantile(x,0.75))
++ names(ret)[1] <- "Median"
++ ret
++ }
+>
+> t(apply(GBSG2[,c("age", "tsize", "pnodes",
++ "progrec", "estrec")], 2, thsum))
+ Median 25% 75%
+age 53.0 46 61.00
+tsize 25.0 20 35.00
+pnodes 3.0 1 7.00
+progrec 32.5 7 131.75
+estrec 36.0 8 114.00
+>
+> table(GBSG2$menostat)
+
+ Pre Post
+ 290 396
+> table(GBSG2$tgrade)
+
+ I II III
+ 81 444 161
+> table(GBSG2$horTh)
+
+ no yes
+440 246
+>
+> # pooled Kaplan-Meier
+>
+> mod <- survfit(Surv(time, cens) ~ 1, data=GBSG2)
+> # integrated Brier score
+> sbrier(Surv(GBSG2$time, GBSG2$cens), mod)
+ [,1]
+[1,] 0.1939366
+attr(,"names")
+[1] "integrated Brier score"
+attr(,"time")
+[1] 8 2659
+> # Brier score at 5 years
+> sbrier(Surv(GBSG2$time, GBSG2$cens), mod, btime=1825)
+Brier score
+ 0.2499984
+attr(,"time")
+[1] 1825
+>
+> # Nottingham prognostic index
+>
+> GBSG2 <- GBSG2[order(GBSG2$time),]
+>
+> NPI <- 0.2*GBSG2$tsize/10 + 1 + as.integer(GBSG2$tgrade)
+> NPI[NPI < 3.4] <- 1
+> NPI[NPI >= 3.4 & NPI <=5.4] <- 2
+> NPI[NPI > 5.4] <- 3
+>
+> mod <- survfit(Surv(time, cens) ~ NPI, data=GBSG2)
+> plot(mod)
+>
+> pred <- c()
+> survs <- c()
+> for (i in sort(unique(NPI)))
++ survs <- c(survs, getsurv(mod[i], 1825))
+>
+> for (i in 1:nrow(GBSG2))
++ pred <- c(pred, survs[NPI[i]])
+>
+> # Brier score of NPI at t=5 years
+> sbrier(Surv(GBSG2$time, GBSG2$cens), pred, btime=1825)
+Brier score
+ 0.233823
+attr(,"time")
+[1] 1825
+>
+>
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:survival’
+
+> nameEx("slda")
+> ### * slda
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: slda
+> ### Title: Stabilised Linear Discriminant Analysis
+> ### Aliases: slda slda.default slda.formula slda.factor
+> ### Keywords: multivariate
+>
+> ### ** Examples
+>
+>
+> library("mlbench")
+> library("MASS")
+> learn <- as.data.frame(mlbench.twonorm(100))
+> test <- as.data.frame(mlbench.twonorm(1000))
+>
+> mlda <- lda(classes ~ ., data=learn)
+> mslda <- slda(classes ~ ., data=learn)
+>
+> print(mean(predict(mlda, newdata=test)$class != test$classes))
+[1] 0.047
+> print(mean(predict(mslda, newdata=test)$class != test$classes))
+[1] 0.025
+>
+>
+>
+>
+> cleanEx()
+
+detaching ‘package:MASS’, ‘package:mlbench’
+
+> nameEx("varset")
+> ### * varset
+>
+> flush(stderr()); flush(stdout())
+>
+> ### Name: varset
+> ### Title: Simulation Model
+> ### Aliases: varset
+> ### Keywords: misc
+>
+> ### ** Examples
+>
+>
+> theta90 <- varset(N = 1000, sigma = 0.1, theta = 90, threshold = 0)
+> theta0 <- varset(N = 1000, sigma = 0.1, theta = 0, threshold = 0)
+> par(mfrow = c(1, 2))
+> plot(theta0$intermediate)
+> plot(theta90$intermediate)
+>
+>
+>
+>
+> graphics::par(get("par.postscript", pos = 'CheckExEnv'))
+> ### * <FOOTER>
+> ###
+> options(digits = 7L)
+> base::cat("Time elapsed: ", proc.time() - base::get("ptime", pos = 'CheckExEnv'),"\n")
+Time elapsed: 4.548 0.04 4.606 0 0
+> grDevices::dev.off()
+null device
+ 1
+> ###
+> ### Local variables: ***
+> ### mode: outline-minor ***
+> ### outline-regexp: "\\(> \\)?### [*]+" ***
+> ### End: ***
+> quit('no')
diff --git a/tests/ipred-bugs.R b/tests/ipred-bugs.R
new file mode 100644
index 0000000..ad4b150
--- /dev/null
+++ b/tests/ipred-bugs.R
@@ -0,0 +1,51 @@
+library(ipred)
+
+
+actversion <- paste(R.version$major, R.version$minor, sep=".")
+thisversion <- "1.7.0"
+
+#if (compareVersion(actversion, thisversion) >= 0) {
+# RNGversion("1.6.2")
+#}
+set.seed(29081975)
+
+data("BreastCancer", package = "mlbench")
+mod <- bagging(Class ~ Cl.thickness + Cell.size
+ + Cell.shape + Marg.adhesion
+ + Epith.c.size + Bare.nuclei
+ + Bl.cromatin + Normal.nucleoli
+ + Mitoses, data=BreastCancer, coob=TRUE)
+print(mod)
+
+print(a <- predict(mod, newdata=BreastCancer))
+stopifnot(length(a) == nrow(BreastCancer))
+
+# bagging failed if only one predictor was specified
+# by Christoph M. Friedrich <chris at uni-wh.de>, April 29th, 2002
+
+X <- as.data.frame(matrix(rnorm(1000), ncol=10))
+y <- factor(ifelse(apply(X, 1, mean) > 0, 1, 0))
+learn <- cbind(y, X)
+mt <- bagging(y ~ V1, data=learn, coob=TRUE)
+# <FIXME>
+# This won't work because of some difficulties with predict.lda
+# mt <- bagging(y ~ V1, data=learn, method="double", coob=FALSE)
+# </FIXME>
+X <- as.data.frame(matrix(rnorm(1000), ncol=10))
+y <- apply(X, 1, mean) + rnorm(nrow(X))
+learn <- cbind(y, X)
+mt <- bagging(y ~ V1, data=learn, coob=TRUE)
+
+# cv.numeric and bootest.numeric were broken, check for reasonaly values
+X <- as.data.frame(matrix(rnorm(1000), ncol=10))
+y <- apply(X, 1, mean) + rnorm(nrow(X))
+learn <- cbind(y, X)
+newy <- apply(X, 1, mean) + rnorm(nrow(X))
+mod <- lm(y ~ ., data=learn)
+trueerr <- sqrt(mean((newy - fitted(mod))^2))
+cverr <- rep(0,5)
+for (i in 1:5) cverr[i] <- errorest(y ~., data=learn, model=lm)$error
+booterr <- errorest(y ~., data=learn, model=lm,
+ estimator="boot",est.para=control.errorest(nboot=50))$error
+print(trueerr/mean(cverr))
+print(trueerr/booterr)
diff --git a/tests/ipred-bugs.Rout.save b/tests/ipred-bugs.Rout.save
new file mode 100644
index 0000000..0a2f710
--- /dev/null
+++ b/tests/ipred-bugs.Rout.save
@@ -0,0 +1,185 @@
+
+R Under development (unstable) (2015-02-20 r67856) -- "Unsuffered Consequences"
+Copyright (C) 2015 The R Foundation for Statistical Computing
+Platform: x86_64-unknown-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> library(ipred)
+>
+>
+> actversion <- paste(R.version$major, R.version$minor, sep=".")
+> thisversion <- "1.7.0"
+>
+> #if (compareVersion(actversion, thisversion) >= 0) {
+> # RNGversion("1.6.2")
+> #}
+> set.seed(29081975)
+>
+> data("BreastCancer", package = "mlbench")
+> mod <- bagging(Class ~ Cl.thickness + Cell.size
++ + Cell.shape + Marg.adhesion
++ + Epith.c.size + Bare.nuclei
++ + Bl.cromatin + Normal.nucleoli
++ + Mitoses, data=BreastCancer, coob=TRUE)
+> print(mod)
+
+Bagging classification trees with 25 bootstrap replications
+
+Call: bagging.data.frame(formula = Class ~ Cl.thickness + Cell.size +
+ Cell.shape + Marg.adhesion + Epith.c.size + Bare.nuclei +
+ Bl.cromatin + Normal.nucleoli + Mitoses, data = BreastCancer,
+ coob = TRUE)
+
+Out-of-bag estimate of misclassification error: 0.0439
+
+>
+> print(a <- predict(mod, newdata=BreastCancer))
+ [1] benign benign benign malignant benign malignant benign
+ [8] benign benign benign benign benign malignant benign
+ [15] malignant malignant benign benign malignant benign malignant
+ [22] malignant benign malignant benign malignant benign benign
+ [29] benign benign benign benign malignant benign benign
+ [36] benign malignant benign malignant malignant malignant malignant
+ [43] malignant malignant malignant benign malignant benign benign
+ [50] malignant malignant malignant malignant malignant malignant malignant
+ [57] malignant malignant malignant malignant malignant benign malignant
+ [64] malignant benign malignant benign malignant malignant benign
+ [71] benign malignant benign malignant malignant benign benign
+ [78] benign benign benign benign benign benign benign
+ [85] malignant malignant malignant malignant benign benign benign
+ [92] benign benign benign benign benign benign benign
+ [99] malignant malignant malignant malignant benign malignant malignant
+[106] malignant malignant malignant benign malignant benign malignant
+[113] malignant malignant benign benign benign malignant benign
+[120] benign benign benign malignant malignant malignant benign
+[127] malignant benign malignant benign benign benign malignant
+[134] benign benign benign benign benign benign benign
+[141] benign benign malignant benign benign benign malignant
+[148] benign benign malignant benign malignant malignant benign
+[155] benign malignant benign benign benign malignant malignant
+[162] benign benign benign benign benign malignant malignant
+[169] benign benign benign benign benign malignant malignant
+[176] malignant benign malignant benign malignant benign benign
+[183] benign malignant malignant benign malignant malignant malignant
+[190] benign malignant malignant benign benign benign benign
+[197] benign benign benign benign malignant malignant benign
+[204] benign benign malignant malignant benign benign benign
+[211] malignant malignant benign malignant malignant malignant benign
+[218] benign malignant benign benign malignant malignant malignant
+[225] malignant benign malignant malignant benign malignant malignant
+[232] malignant benign malignant benign benign malignant malignant
+[239] malignant malignant benign benign benign benign benign
+[246] benign malignant malignant benign benign benign malignant
+[253] benign malignant malignant malignant benign benign benign
+[260] benign malignant malignant malignant malignant malignant benign
+[267] malignant malignant malignant benign malignant benign malignant
+[274] malignant benign benign benign benign benign malignant
+[281] benign benign malignant malignant malignant malignant malignant
+[288] benign malignant malignant benign benign malignant malignant
+[295] benign malignant benign benign benign malignant malignant
+[302] benign malignant benign malignant malignant benign benign
+[309] malignant benign benign benign malignant benign benign
+[316] malignant malignant malignant benign benign malignant benign
+[323] benign malignant benign benign malignant benign malignant
+[330] malignant malignant benign benign malignant malignant benign
+[337] malignant benign benign malignant malignant benign benign
+[344] benign malignant benign benign benign malignant malignant
+[351] benign benign benign malignant benign benign malignant
+[358] malignant malignant malignant malignant malignant benign benign
+[365] benign benign malignant malignant benign benign benign
+[372] benign benign benign benign benign benign benign
+[379] benign benign benign malignant benign benign benign
+[386] benign malignant benign benign benign benign malignant
+[393] benign benign benign benign benign benign benign
+[400] benign malignant benign benign benign benign benign
+[407] benign benign benign benign benign benign malignant
+[414] benign malignant benign malignant benign benign benign
+[421] benign malignant benign benign benign malignant benign
+[428] malignant benign benign benign benign benign benign
+[435] benign malignant malignant benign benign benign malignant
+[442] benign benign benign benign benign benign benign
+[449] benign malignant benign benign benign malignant benign
+[456] malignant malignant malignant benign benign benign benign
+[463] benign benign benign malignant malignant malignant benign
+[470] benign benign benign benign benign benign benign
+[477] benign benign benign malignant benign benign malignant
+[484] malignant benign benign benign malignant malignant malignant
+[491] benign malignant benign malignant benign benign benign
+[498] benign benign benign benign benign benign benign
+[505] benign benign malignant benign benign benign benign
+[512] benign benign benign malignant malignant benign benign
+[519] benign malignant benign benign malignant malignant benign
+[526] benign benign benign benign benign malignant benign
+[533] benign benign benign benign benign benign benign
+[540] benign benign benign benign benign benign benign
+[547] malignant benign benign malignant benign benign benign
+[554] benign benign benign benign benign benign benign
+[561] benign benign benign benign benign malignant benign
+[568] benign malignant malignant malignant malignant benign benign
+[575] malignant benign benign benign benign benign benign
+[582] malignant malignant benign benign benign malignant benign
+[589] malignant benign malignant malignant malignant benign malignant
+[596] benign benign benign benign benign benign benign
+[603] benign malignant malignant malignant benign benign malignant
+[610] benign malignant malignant malignant benign benign benign
+[617] benign benign benign benign benign benign benign
+[624] benign benign benign malignant benign benign benign
+[631] benign benign benign malignant benign benign malignant
+[638] benign benign benign benign benign benign benign
+[645] benign benign benign benign malignant benign benign
+[652] benign benign benign benign benign benign benign
+[659] malignant benign benign benign benign benign benign
+[666] benign benign benign malignant malignant malignant benign
+[673] benign benign benign benign benign benign benign
+[680] benign malignant malignant benign benign benign benign
+[687] benign benign benign benign benign malignant benign
+[694] benign benign benign malignant malignant malignant
+Levels: benign malignant
+> stopifnot(length(a) == nrow(BreastCancer))
+>
+> # bagging failed if only one predictor was specified
+> # by Christoph M. Friedrich <chris at uni-wh.de>, April 29th, 2002
+>
+> X <- as.data.frame(matrix(rnorm(1000), ncol=10))
+> y <- factor(ifelse(apply(X, 1, mean) > 0, 1, 0))
+> learn <- cbind(y, X)
+> mt <- bagging(y ~ V1, data=learn, coob=TRUE)
+> # <FIXME>
+> # This won't work because of some difficulties with predict.lda
+> # mt <- bagging(y ~ V1, data=learn, method="double", coob=FALSE)
+> # </FIXME>
+> X <- as.data.frame(matrix(rnorm(1000), ncol=10))
+> y <- apply(X, 1, mean) + rnorm(nrow(X))
+> learn <- cbind(y, X)
+> mt <- bagging(y ~ V1, data=learn, coob=TRUE)
+>
+> # cv.numeric and bootest.numeric were broken, check for reasonaly values
+> X <- as.data.frame(matrix(rnorm(1000), ncol=10))
+> y <- apply(X, 1, mean) + rnorm(nrow(X))
+> learn <- cbind(y, X)
+> newy <- apply(X, 1, mean) + rnorm(nrow(X))
+> mod <- lm(y ~ ., data=learn)
+> trueerr <- sqrt(mean((newy - fitted(mod))^2))
+> cverr <- rep(0,5)
+> for (i in 1:5) cverr[i] <- errorest(y ~., data=learn, model=lm)$error
+> booterr <- errorest(y ~., data=learn, model=lm,
++ estimator="boot",est.para=control.errorest(nboot=50))$error
+> print(trueerr/mean(cverr))
+[1] 0.9612632
+> print(trueerr/booterr)
+[1] 0.9073771
+>
+> proc.time()
+ user system elapsed
+ 1.660 0.076 1.723
diff --git a/tests/ipred-segfault.R b/tests/ipred-segfault.R
new file mode 100644
index 0000000..c6298e6
--- /dev/null
+++ b/tests/ipred-segfault.R
@@ -0,0 +1,136 @@
+library("ipred")
+library("mlbench")
+library("MASS")
+library("survival")
+
+actversion <- paste(R.version$major, R.version$minor, sep=".")
+thisversion <- "1.7.0"
+
+#if (compareVersion(actversion, thisversion) >= 0) {
+# RNGversion("1.6.2")
+#}
+set.seed(29081975)
+
+
+# Classification
+
+learn <- as.data.frame(mlbench.twonorm(200))
+test <- as.data.frame(mlbench.twonorm(100))
+
+# bagging
+
+mod <- bagging(classes ~ ., data=learn, coob=TRUE, nbagg=10)
+mod
+predict(mod)[1:10]
+
+# Double-Bagging
+
+comb.lda <- list(list(model=lda, predict=function(obj, newdata)
+ predict(obj, newdata)$x))
+
+mod <- bagging(classes ~ ., data=learn, comb=comb.lda, nbagg=10)
+mod
+predict(mod, newdata=test[1:10,])
+predict(mod, newdata=test[1:10,], agg="aver")
+predict(mod, newdata=test[1:10,], agg="wei")
+predict(mod, newdata=test[1:10,], type="prob")
+predict(mod, newdata=test[1:10,], type="prob", agg="aver")
+predict(mod, newdata=test[1:10,], type="prob", agg="wei")
+
+mypredict.lda <- function(object, newdata)
+ predict(object, newdata = newdata)$class
+
+errorest(classes ~ ., data=learn, model=lda, predict=mypredict.lda)
+errorest(classes ~ ., data=learn, model=lda, predict=mypredict.lda,
+ est.para=control.errorest(k=5, random=FALSE))
+
+lapply(errorest(classes ~ ., data=learn, model=lda, predict=mypredict.lda,
+ est.para=control.errorest(k=5, random=FALSE, getmodels=TRUE))$models, class)
+errorest(classes ~ ., data=learn, model=bagging,
+ est.para=control.errorest(k=2), nbagg=10)
+errorest(classes ~ ., data=learn, model=bagging,
+ est.para=control.errorest(k=2), nbagg=10, comb=comb.lda)
+errorest(classes ~ ., data=learn, model=lda,
+predict=mypredict.lda, estimator="boot")
+errorest(classes ~ ., data=learn, model=lda,
+predict=mypredict.lda, estimator="632plus")
+
+# Regression
+
+learn <- as.data.frame(mlbench.friedman1(100))
+test <- as.data.frame(mlbench.friedman1(100))
+
+# bagging
+
+mod <- bagging(y ~ ., data=learn, coob=TRUE, nbagg=10)
+mod
+predict(mod)[1:10]
+
+predict(mod, newdata=test[1:10,])
+predict(mod, newdata=test[1:10,], agg="aver")
+predict(mod, newdata=test[1:10,], agg="wei")
+errorest(y ~ ., data=learn, model=lm)
+errorest(y ~ ., data=learn, model=lm,
+ est.para=control.errorest(k=5, random=FALSE))
+lapply(errorest(y ~ ., data=learn, model=lm,
+ est.para=control.errorest(k=5, random=FALSE, getmodels=TRUE))$models, class)
+errorest(y ~ ., data=learn, model=lm, estimator="boot")
+
+# survival
+
+learn <- rsurv(100, model="C")
+test <- rsurv(100, model="C")
+
+mod <- bagging(Surv(time, cens) ~ ., data=learn, nbagg=10)
+mod
+predict(mod, newdata=test[1:10,])
+
+#errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
+# est.para=list(k=2, random=FALSE), nbagg=5)
+#errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
+# estimator="boot", nbagg=5, est.para=list(nboot=5))
+#insert control.errorest
+errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
+ est.para=control.errorest(k=2, random=FALSE), nbagg=5)
+errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
+ estimator="boot", nbagg=5, est.para=control.errorest(nboot=5))
+
+#lapply(errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
+# estimator="cv", nbagg=1, est.para=list(k=2, random=FALSE,
+# getmodels=TRUE))$models, class)
+#insert control.errorest
+lapply(errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
+ estimator="cv", nbagg=1, est.para=control.errorest(k=2, random=FALSE,
+ getmodels=TRUE))$models, class)
+
+# bundling for regression
+
+learn <- as.data.frame(mlbench.friedman1(100))
+test <- as.data.frame(mlbench.friedman1(100))
+
+comb <- list(list(model=lm, predict=predict.lm))
+
+modc <- bagging(y ~ ., data=learn, nbagg=10, comb=comb)
+modc
+predict(modc, newdata=learn)[1:10]
+
+# bundling for survival
+
+while(FALSE) {
+data("GBSG2", package = "ipred")
+rcomb <- list(list(model=coxph, predict=predict))
+
+mods <- bagging(Surv(time,cens) ~ ., data=GBSG2, nbagg=10,
+ comb=rcomb, control=rpart.control(xval=0))
+predict(mods, newdata=GBSG2[1:3,])
+
+# test for method dispatch on integer valued responses
+y <- sample(1:100, 100)
+class(y)
+x <- matrix(rnorm(100*5), ncol=5)
+mydata <- as.data.frame(cbind(y, x))
+
+cv(y, y ~ ., data=mydata, model=lm, predict=predict)
+bootest(y, y ~ ., data=mydata, model=lm, predict=predict)
+bagging(y ~., data=mydata, nbagg=10)
+}
diff --git a/tests/ipred-segfault.Rout.save b/tests/ipred-segfault.Rout.save
new file mode 100644
index 0000000..7b3b2b4
--- /dev/null
+++ b/tests/ipred-segfault.Rout.save
@@ -0,0 +1,447 @@
+
+R Under development (unstable) (2015-07-25 r68745) -- "Unsuffered Consequences"
+Copyright (C) 2015 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> library("ipred")
+> library("mlbench")
+> library("MASS")
+> library("survival")
+>
+> actversion <- paste(R.version$major, R.version$minor, sep=".")
+> thisversion <- "1.7.0"
+>
+> #if (compareVersion(actversion, thisversion) >= 0) {
+> # RNGversion("1.6.2")
+> #}
+> set.seed(29081975)
+>
+>
+> # Classification
+>
+> learn <- as.data.frame(mlbench.twonorm(200))
+> test <- as.data.frame(mlbench.twonorm(100))
+>
+> # bagging
+>
+> mod <- bagging(classes ~ ., data=learn, coob=TRUE, nbagg=10)
+> mod
+
+Bagging classification trees with 10 bootstrap replications
+
+Call: bagging.data.frame(formula = classes ~ ., data = learn, coob = TRUE,
+ nbagg = 10)
+
+Out-of-bag estimate of misclassification error: 0.195
+
+> predict(mod)[1:10]
+ [1] 1 1 1 2 2 2 1 1 1 1
+Levels: 1 2
+>
+> # Double-Bagging
+>
+> comb.lda <- list(list(model=lda, predict=function(obj, newdata)
++ predict(obj, newdata)$x))
+>
+> mod <- bagging(classes ~ ., data=learn, comb=comb.lda, nbagg=10)
+> mod
+
+Bagging classification trees with 10 bootstrap replications
+
+Call: bagging.data.frame(formula = classes ~ ., data = learn, comb = comb.lda,
+ nbagg = 10)
+
+
+> predict(mod, newdata=test[1:10,])
+ [1] 1 1 1 1 1 2 1 2 1 2
+Levels: 1 2
+> predict(mod, newdata=test[1:10,], agg="aver")
+ [1] 1 1 1 1 2 2 1 2 1 2
+Levels: 1 2
+> predict(mod, newdata=test[1:10,], agg="wei")
+ [1] 1 1 1 1 2 2 1 2 1 2
+Levels: 1 2
+> predict(mod, newdata=test[1:10,], type="prob")
+ 1 2
+ [1,] 1.0 0.0
+ [2,] 0.9 0.1
+ [3,] 1.0 0.0
+ [4,] 1.0 0.0
+ [5,] 0.5 0.5
+ [6,] 0.0 1.0
+ [7,] 1.0 0.0
+ [8,] 0.1 0.9
+ [9,] 1.0 0.0
+[10,] 0.0 1.0
+> predict(mod, newdata=test[1:10,], type="prob", agg="aver")
+ 1 2
+ [1,] 1.0 0.0
+ [2,] 0.9 0.1
+ [3,] 1.0 0.0
+ [4,] 1.0 0.0
+ [5,] 0.5 0.5
+ [6,] 0.0 1.0
+ [7,] 1.0 0.0
+ [8,] 0.1 0.9
+ [9,] 1.0 0.0
+[10,] 0.0 1.0
+> predict(mod, newdata=test[1:10,], type="prob", agg="wei")
+ 1 2
+ [1,] 1.000000000 0.000000000
+ [2,] 0.996441281 0.003558719
+ [3,] 1.000000000 0.000000000
+ [4,] 1.000000000 0.000000000
+ [5,] 0.484359233 0.515640767
+ [6,] 0.000000000 1.000000000
+ [7,] 1.000000000 0.000000000
+ [8,] 0.001138952 0.998861048
+ [9,] 1.000000000 0.000000000
+[10,] 0.000000000 1.000000000
+>
+> mypredict.lda <- function(object, newdata)
++ predict(object, newdata = newdata)$class
+>
+> errorest(classes ~ ., data=learn, model=lda, predict=mypredict.lda)
+
+Call:
+errorest.data.frame(formula = classes ~ ., data = learn, model = lda,
+ predict = mypredict.lda)
+
+ 10-fold cross-validation estimator of misclassification error
+
+Misclassification error: 0.035
+
+> errorest(classes ~ ., data=learn, model=lda, predict=mypredict.lda,
++ est.para=control.errorest(k=5, random=FALSE))
+
+Call:
+errorest.data.frame(formula = classes ~ ., data = learn, model = lda,
+ predict = mypredict.lda, est.para = control.errorest(k = 5,
+ random = FALSE))
+
+ 5-fold cross-validation estimator of misclassification error
+
+Misclassification error: 0.04
+
+>
+> lapply(errorest(classes ~ ., data=learn, model=lda, predict=mypredict.lda,
++ est.para=control.errorest(k=5, random=FALSE, getmodels=TRUE))$models, class)
+[[1]]
+[1] "lda"
+
+[[2]]
+[1] "lda"
+
+[[3]]
+[1] "lda"
+
+[[4]]
+[1] "lda"
+
+[[5]]
+[1] "lda"
+
+> errorest(classes ~ ., data=learn, model=bagging,
++ est.para=control.errorest(k=2), nbagg=10)
+
+Call:
+errorest.data.frame(formula = classes ~ ., data = learn, model = bagging,
+ est.para = control.errorest(k = 2), nbagg = 10)
+
+ 2-fold cross-validation estimator of misclassification error
+
+Misclassification error: 0.12
+
+> errorest(classes ~ ., data=learn, model=bagging,
++ est.para=control.errorest(k=2), nbagg=10, comb=comb.lda)
+
+Call:
+errorest.data.frame(formula = classes ~ ., data = learn, model = bagging,
+ est.para = control.errorest(k = 2), nbagg = 10, comb = comb.lda)
+
+ 2-fold cross-validation estimator of misclassification error
+
+Misclassification error: 0.055
+
+> errorest(classes ~ ., data=learn, model=lda,
++ predict=mypredict.lda, estimator="boot")
+
+Call:
+errorest.data.frame(formula = classes ~ ., data = learn, model = lda,
+ predict = mypredict.lda, estimator = "boot")
+
+ Bootstrap estimator of misclassification error
+ with 25 bootstrap replications
+
+Misclassification error: 0.038
+Standard deviation: 0.0023
+
+> errorest(classes ~ ., data=learn, model=lda,
++ predict=mypredict.lda, estimator="632plus")
+
+Call:
+errorest.data.frame(formula = classes ~ ., data = learn, model = lda,
+ predict = mypredict.lda, estimator = "632plus")
+
+ .632+ Bootstrap estimator of misclassification error
+ with 25 bootstrap replications
+
+Misclassification error: 0.0336
+
+>
+> # Regression
+>
+> learn <- as.data.frame(mlbench.friedman1(100))
+> test <- as.data.frame(mlbench.friedman1(100))
+>
+> # bagging
+>
+> mod <- bagging(y ~ ., data=learn, coob=TRUE, nbagg=10)
+> mod
+
+Bagging regression trees with 10 bootstrap replications
+
+Call: bagging.data.frame(formula = y ~ ., data = learn, coob = TRUE,
+ nbagg = 10)
+
+Out-of-bag estimate of root mean squared error: 3.0773
+
+> predict(mod)[1:10]
+ [1] 13.367299 15.465575 10.728200 21.426540 16.379882 16.992572 NA
+ [8] 12.899667 8.096002 16.070858
+>
+> predict(mod, newdata=test[1:10,])
+ [1] 13.90686 14.94293 12.94732 11.67369 16.63664 14.42252 16.01717 12.34594
+ [9] 11.96764 22.14124
+> predict(mod, newdata=test[1:10,], agg="aver")
+ [1] 13.90686 14.94293 12.94732 11.67369 16.63664 14.42252 16.01717 12.34594
+ [9] 11.96764 22.14124
+> predict(mod, newdata=test[1:10,], agg="wei")
+ [1] 13.96527 14.95040 12.92685 11.61045 16.74963 14.59937 16.46226 12.44494
+ [9] 12.20556 22.00779
+> errorest(y ~ ., data=learn, model=lm)
+
+Call:
+errorest.data.frame(formula = y ~ ., data = learn, model = lm)
+
+ 10-fold cross-validation estimator of root mean squared error
+
+Root mean squared error: 2.7385
+
+> errorest(y ~ ., data=learn, model=lm,
++ est.para=control.errorest(k=5, random=FALSE))
+
+Call:
+errorest.data.frame(formula = y ~ ., data = learn, model = lm,
+ est.para = control.errorest(k = 5, random = FALSE))
+
+ 5-fold cross-validation estimator of root mean squared error
+
+Root mean squared error: 2.7941
+
+> lapply(errorest(y ~ ., data=learn, model=lm,
++ est.para=control.errorest(k=5, random=FALSE, getmodels=TRUE))$models, class)
+[[1]]
+[1] "lm"
+
+[[2]]
+[1] "lm"
+
+[[3]]
+[1] "lm"
+
+[[4]]
+[1] "lm"
+
+[[5]]
+[1] "lm"
+
+> errorest(y ~ ., data=learn, model=lm, estimator="boot")
+
+Call:
+errorest.data.frame(formula = y ~ ., data = learn, model = lm,
+ estimator = "boot")
+
+ Bootstrap estimator of root mean squared error
+ with 25 bootstrap replications
+
+Root mean squared error: 2.7966
+
+>
+> # survival
+>
+> learn <- rsurv(100, model="C")
+> test <- rsurv(100, model="C")
+>
+> mod <- bagging(Surv(time, cens) ~ ., data=learn, nbagg=10)
+> mod
+
+Bagging survival trees with 10 bootstrap replications
+
+Call: bagging.data.frame(formula = Surv(time, cens) ~ ., data = learn,
+ nbagg = 10)
+
+
+> predict(mod, newdata=test[1:10,])
+[[1]]
+Call: survfit(formula = Surv(agglsample[[j]], aggcens[[j]]) ~ 1)
+
+ n events median 0.95LCL 0.95UCL
+117.0000 117.0000 0.0751 0.0640 0.1227
+
+[[2]]
+Call: survfit(formula = Surv(agglsample[[j]], aggcens[[j]]) ~ 1)
+
+ n events median 0.95LCL 0.95UCL
+134.0000 134.0000 0.0280 0.0247 0.0321
+
+[[3]]
+Call: survfit(formula = Surv(agglsample[[j]], aggcens[[j]]) ~ 1)
+
+ n events median 0.95LCL 0.95UCL
+142.000 142.000 0.215 0.179 0.241
+
+[[4]]
+Call: survfit(formula = Surv(agglsample[[j]], aggcens[[j]]) ~ 1)
+
+ n events median 0.95LCL 0.95UCL
+131.0000 131.0000 0.0216 0.0205 0.0280
+
+[[5]]
+Call: survfit(formula = Surv(agglsample[[j]], aggcens[[j]]) ~ 1)
+
+ n events median 0.95LCL 0.95UCL
+145.000 145.000 0.217 0.168 0.251
+
+[[6]]
+Call: survfit(formula = Surv(agglsample[[j]], aggcens[[j]]) ~ 1)
+
+ n events median 0.95LCL 0.95UCL
+105.0000 105.0000 0.0476 0.0421 0.0496
+
+[[7]]
+Call: survfit(formula = Surv(agglsample[[j]], aggcens[[j]]) ~ 1)
+
+ n events median 0.95LCL 0.95UCL
+125.000 125.000 0.181 0.156 0.218
+
+[[8]]
+Call: survfit(formula = Surv(agglsample[[j]], aggcens[[j]]) ~ 1)
+
+ n events median 0.95LCL 0.95UCL
+139.000 139.000 0.179 0.156 0.217
+
+[[9]]
+Call: survfit(formula = Surv(agglsample[[j]], aggcens[[j]]) ~ 1)
+
+ n events median 0.95LCL 0.95UCL
+131.000 131.000 0.181 0.179 0.251
+
+[[10]]
+Call: survfit(formula = Surv(agglsample[[j]], aggcens[[j]]) ~ 1)
+
+ n events median 0.95LCL 0.95UCL
+120.0000 120.0000 0.0280 0.0247 0.0395
+
+>
+> #errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
+> # est.para=list(k=2, random=FALSE), nbagg=5)
+> #errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
+> # estimator="boot", nbagg=5, est.para=list(nboot=5))
+> #insert control.errorest
+> errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
++ est.para=control.errorest(k=2, random=FALSE), nbagg=5)
+
+Call:
+errorest.data.frame(formula = Surv(time, cens) ~ ., data = learn,
+ model = bagging, est.para = control.errorest(k = 2, random = FALSE),
+ nbagg = 5)
+
+ 2-fold cross-validation estimator of Brier's score
+
+Brier's score: 0.0874
+
+> errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
++ estimator="boot", nbagg=5, est.para=control.errorest(nboot=5))
+
+Call:
+errorest.data.frame(formula = Surv(time, cens) ~ ., data = learn,
+ model = bagging, estimator = "boot", est.para = control.errorest(nboot = 5),
+ nbagg = 5)
+
+ Bootstrap estimator of Brier's score
+ with 5 bootstrap replications
+
+Brier's score: 0.0956
+
+>
+> #lapply(errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
+> # estimator="cv", nbagg=1, est.para=list(k=2, random=FALSE,
+> # getmodels=TRUE))$models, class)
+> #insert control.errorest
+> lapply(errorest(Surv(time, cens) ~ ., data=learn, model=bagging,
++ estimator="cv", nbagg=1, est.para=control.errorest(k=2, random=FALSE,
++ getmodels=TRUE))$models, class)
+[[1]]
+[1] "survbagg"
+
+[[2]]
+[1] "survbagg"
+
+>
+> # bundling for regression
+>
+> learn <- as.data.frame(mlbench.friedman1(100))
+> test <- as.data.frame(mlbench.friedman1(100))
+>
+> comb <- list(list(model=lm, predict=predict.lm))
+>
+> modc <- bagging(y ~ ., data=learn, nbagg=10, comb=comb)
+> modc
+
+Bagging regression trees with 10 bootstrap replications
+
+Call: bagging.data.frame(formula = y ~ ., data = learn, nbagg = 10,
+ comb = comb)
+
+
+> predict(modc, newdata=learn)[1:10]
+ [1] 7.192725 20.401036 7.607863 14.909765 15.721930 8.544955 16.968957
+ [8] 16.818052 12.692932 14.218597
+>
+> # bundling for survival
+>
+> while(FALSE) {
++ data("GBSG2", package = "ipred")
++ rcomb <- list(list(model=coxph, predict=predict))
++
++ mods <- bagging(Surv(time,cens) ~ ., data=GBSG2, nbagg=10,
++ comb=rcomb, control=rpart.control(xval=0))
++ predict(mods, newdata=GBSG2[1:3,])
++
++ # test for method dispatch on integer valued responses
++ y <- sample(1:100, 100)
++ class(y)
++ x <- matrix(rnorm(100*5), ncol=5)
++ mydata <- as.data.frame(cbind(y, x))
++
++ cv(y, y ~ ., data=mydata, model=lm, predict=predict)
++ bootest(y, y ~ ., data=mydata, model=lm, predict=predict)
++ bagging(y ~., data=mydata, nbagg=10)
++ }
+>
+> proc.time()
+ user system elapsed
+ 4.900 0.047 4.933
diff --git a/tests/ipred-smalltest.R b/tests/ipred-smalltest.R
new file mode 100644
index 0000000..981fe8a
--- /dev/null
+++ b/tests/ipred-smalltest.R
@@ -0,0 +1,34 @@
+
+library(ipred)
+
+# check if SdiffKM computes
+#
+# int_start^stop (exp(-h*t) - c)^2 dt
+#
+# in the correct way
+
+# low-level interface needed
+myfoo <- function(times, prob, h, window=0.0001) {
+ .Call("SdiffKM", as.double(c(0, times)),
+ as.double(c(prob[1], prob)), as.double(c(h,
+ window)), PACKAGE = "ipred")
+}
+
+# to compare with
+mexp <- function(start, stop, haz, c=0) {
+ foo <- function(t)
+ exp(-2*haz*t)/(-2*haz) - 2*c*exp(-haz*t)/(-haz) + c^2*t
+ foo(stop) - foo(start)
+}
+
+
+times <- seq(from=0.01, to=8, by=0.01)
+
+for (myc in c(0,0.5,0.9)) {
+ for (h in c(1,2,3)) {
+ prob <- rep(myc, length(times))
+ a <- round(mexp(0, max(times), h, c=myc),7)
+ b <- round(myfoo(times, prob, h), 7)
+ stopifnot(all.equal(a,b))
+ }
+}
diff --git a/tests/ipred-smalltest.Rout.save b/tests/ipred-smalltest.Rout.save
new file mode 100644
index 0000000..cd594ec
--- /dev/null
+++ b/tests/ipred-smalltest.Rout.save
@@ -0,0 +1,55 @@
+
+R Under development (unstable) (2015-02-20 r67856) -- "Unsuffered Consequences"
+Copyright (C) 2015 The R Foundation for Statistical Computing
+Platform: x86_64-unknown-linux-gnu (64-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+>
+> library(ipred)
+>
+> # check if SdiffKM computes
+> #
+> # int_start^stop (exp(-h*t) - c)^2 dt
+> #
+> # in the correct way
+>
+> # low-level interface needed
+> myfoo <- function(times, prob, h, window=0.0001) {
++ .Call("SdiffKM", as.double(c(0, times)),
++ as.double(c(prob[1], prob)), as.double(c(h,
++ window)), PACKAGE = "ipred")
++ }
+>
+> # to compare with
+> mexp <- function(start, stop, haz, c=0) {
++ foo <- function(t)
++ exp(-2*haz*t)/(-2*haz) - 2*c*exp(-haz*t)/(-haz) + c^2*t
++ foo(stop) - foo(start)
++ }
+>
+>
+> times <- seq(from=0.01, to=8, by=0.01)
+>
+> for (myc in c(0,0.5,0.9)) {
++ for (h in c(1,2,3)) {
++ prob <- rep(myc, length(times))
++ a <- round(mexp(0, max(times), h, c=myc),7)
++ b <- round(myfoo(times, prob, h), 7)
++ stopifnot(all.equal(a,b))
++ }
++ }
+>
+> proc.time()
+ user system elapsed
+ 0.315 0.048 0.351
diff --git a/vignettes/ipred-examples.Rnw b/vignettes/ipred-examples.Rnw
new file mode 100644
index 0000000..1f98dc1
--- /dev/null
+++ b/vignettes/ipred-examples.Rnw
@@ -0,0 +1,401 @@
+\documentclass[11pt]{article}
+\usepackage[round]{natbib}
+\usepackage{bibentry}
+\usepackage{amsfonts}
+\usepackage{hyperref}
+\renewcommand{\baselinestretch}{1.3}
+\newcommand{\ipred}{\texttt{ipred }}
+
+%\VignetteIndexEntry{Some more or less useful examples for illustration.}
+%\VignetteDepends{ipred}
+%\textwidth=6.2in
+%\VignetteDepends{mvtnorm,TH.data,rpart,MASS}
+
+\begin{document}
+\title{\ipred: Improved Predictors}
+\date{}
+\SweaveOpts{engine=R,eps=TRUE,pdf=TRUE}
+
+<<preliminaries,echo=FALSE>>=
+options(prompt=">", width=50)
+set.seed(210477)
+@
+
+\maketitle
+
+This short manual is heavily based on
+\cite{Rnews:Peters+Hothorn+Lausen:2002} and needs some improvements.
+
+\section{Introduction}
+In classification problems, there are several attempts to create rules which assign future observations to
+certain classes. Common methods are for
+example linear discriminant analysis or
+classification trees. Recent developments lead to substantial reduction of misclassification error
+in many applications.
+Bootstrap aggregation \citep[``bagging'',][]{breiman:1996} combines
+classifiers trained on bootstrap samples of the original data. Another
+approach is indirect classification, which
+incorporates a priori knowledge
+into a classification rule \citep{hand:2001}.
+Since the misclassification error is a criterion to assess the
+classification techniques, its estimation is of main importance.
+A nearly unbiased but highly variable estimator can be calculated by cross validation. \cite{efron:1997} discuss bootstrap
+estimates of misclassification error.
+As a by-product of bagging, \cite{out-of-bag:1996} proposes the out-of-bag
+estimator. \\
+However, the calculation of the desired classification models and
+their misclassification errors is often aggravated by different and
+specialized interfaces of the various procedures. We propose the \ipred
+package as a first attempt to create a unified interface for improved predictors and various error rate estimators.
+In the following we demonstrate the functionality of the package
+in the example of glaucoma classification. We start with an overview
+about the disease and data and review the implemented
+classification and estimation methods in context with their
+application to glaucoma diagnosis.
+
+
+\section{Glaucoma}
+Glaucoma is a slowly processing and irreversible disease that affects
+the optic nerve head. It is the second most reason for blindness worldwide.
+Glaucoma is usually diagnosed based on a reduced visual field,
+assessed by a medical examination of perimetry and a smaller number of
+intact nerve fibers at the optic nerve head. One opportunity to examine
+the amount of intact nerve fibers is using the Heidelberg Retina
+Tomograph (HRT), a confocal laser scanning tomograph, which does a
+three dimensional topographical analysis of the optic nerve head morphology.
+
+It produces a series of $32$ images, each of $256 \times 256$ pixels,
+which are converted to a single topographic image. A less complex,
+but although a less informative examination tool is the $2$-dimensional
+fundus photography. However, in cooperation with clinicians and a
+priori analysis we derived a diagnosis of glaucoma based on three variables
+only: $w_{lora}$ represents the loss of nerve fibers and is obtained by a
+$2$-dimensional fundus photography, $w_{cs}$ and $w_{clv}$ describe the
+visual field defect \citep{ifcs:2001}.
+
+\begin{center}
+\begin{figure}[h]
+\begin{center}
+{\small
+\setlength{\unitlength}{0.6cm}
+\begin{picture}(14.5,5)
+ \put(5, 4.5){\makebox(2, 0.5){$w_{clv}\geq 5.1$}}
+ \put(2.5, 3){\makebox(2, 0.5){$w_{lora}\geq 49.23$}}
+ \put(7.5, 3){\makebox(2, 0.5){$w_{lora} \geq 58.55$}}
+\put(0, 1.5){\makebox(2, 0.5){$glaucoma$}}
+ \put(3.5, 1.5){\makebox(2, 0.5){$normal$}}
+ \put(6.5, 1.5){\makebox(2, 0.5){$w_{cs} < 1.405$}}
+ \put(10, 1.5){\makebox(2, 0.5){$normal$}}
+
+ \put(3.5, 0){\makebox(2, 0.5){$glaucoma$}}
+ \put(6.5, 0){\makebox(2, 0.5){$normal$}}
+
+ \put(6, 4.5){\vector(-3, -2){1.5}}
+ \put(6, 4.5){\vector(3, -2){1.5}}
+
+ \put(3.5, 3){\vector(3, -2){1.5}}
+ \put(3.5, 3){\vector(-3, -2){1.5}}
+ \put(8.5, 3){\vector(3, -2){1.5}}
+ \put(8.5, 3){\vector(-3, -2){1.5}}
+
+ \put(6.5, 1.5){\vector(3, -2){1.5}}
+ \put(6.5, 1.5){\vector(-3, -2){1.5}}
+\end{picture}
+}
+\end{center}
+\caption{Glaucoma diagnosis. \label{diag}}
+\end{figure}
+\end{center}
+
+Figure \ref{diag} represents the diagnosis of glaucoma in terms of a medical
+decision tree. A complication of the disease is that a damage in the
+optic nerve head morphology precedes a measurable
+visual field defect. Furthermore, an early detection
+is of main importance, since an adequate therapy can only slow down the
+progression of the disease. Hence, a classification rule for detecting
+early damages should include morphological informations, rather than
+visual field data only.
+
+Two example datasets are included in the package. The first one contains
+measurements of the eye morphology only (\texttt{GlaucomaM}), including $62$
+variables for $196$ observations. The second dataset (\texttt{GlaucomaMVF})
+contains additional visual field measurements for a different set of
+patients. In both example datasets, the observations in the two groups are
+matched by age and sex to prevent any bias.
+
+\section{Bagging}
+Referring to the example of glaucoma diagnosis we first
+demonstrate the functionality of the \texttt{bagging} function.
+We fit \texttt{nbagg = 25} (default) classification trees for bagging by
+<<bagging,echo=TRUE>>=
+library("ipred")
+library("rpart")
+library("MASS")
+data("GlaucomaM", package="TH.data")
+gbag <- bagging(Class ~ ., data = GlaucomaM, coob=TRUE)
+@
+where \texttt{GlaucomaM} contains explanatory HRT variables
+and the response of glaucoma diagnosis (\texttt{Class}),
+a factor at two levels \texttt{normal} and \texttt{glaucoma}.
+\texttt{print} returns informations about the returned object,
+i.e. the number of bootstrap replications used and, as requested by
+\texttt{coob=TRUE}, the out-of-bag estimate of misclassification error
+\citep{out-of-bag:1996}.
+<<print-bagging, echo=TRUE>>=
+print(gbag)
+@
+The out-of-bag estimate uses the observations which are left out in a
+bootstrap sample to estimate the misclassification error at almost no
+additional computational costs.
+\cite{double-bag:2002} propose to use the
+out-of-bag samples for a combination of linear discriminant analysis and
+classification trees, called ``Double-Bagging''. For example, a combination
+of a stabilised linear disciminant analysis with classification trees can be
+computed along the following lines
+<<double-bagging, echo=TRUE>>=
+scomb <- list(list(model=slda, predict=function(object, newdata)
+ predict(object, newdata)$x))
+gbagc <- bagging(Class ~ ., data = GlaucomaM, comb=scomb)
+@
+\texttt{predict} predicts future observations according to the
+fitted model.
+<<predict.bagging, echo=TRUE>>=
+predict(gbagc, newdata=GlaucomaM[c(1:3, 99:102), ])
+@
+Both \texttt{bagging} and \texttt{predict} rely on the \texttt{rpart}
+routines. The \texttt{rpart} routine for each bootstrap sample
+can be controlled in the usual way. By default \texttt{rpart.control} is used
+with \texttt{minsize=2} and \texttt{cp=0} and it is wise to turn
+cross-validation off (\texttt{xval=0}). The function \texttt{prune} can
+be used to prune each of the trees to an
+appropriate size.
+
+\section{Indirect Classification}
+Especially in a medical context it often occurs that a priori
+knowledge about a classifying structure is given. For example
+it might be known that a disease is assessed on a subgroup of
+the given variables or, moreover, that class memberships are
+assigned by a deterministically known classifying function.
+\cite{hand:2001} proposes the framework of indirect classification
+which incorporates this a priori knowledge into a classification rule.
+In this framework we subdivide a given data set into three groups of
+variables: those to be used predicting the class membership
+(explanatory), those to be used defining the class membership
+(intermediate) and the class membership variable itself (response).
+For future observations, an indirect classifier predicts values
+for the appointed intermediate variables based
+on explanatory variables only. The observation is classified
+based on their predicted intermediate variables and a fixed
+classifying function. This indirect way of classification using
+the predicted intermediate variables offers possibilities to
+incorporate a priori knowledge by the subdivision of variables and
+by the construction of a fixed classifying function.
+
+We apply indirect classification by using the function \texttt{inclass}.
+Referring to the glaucoma example, explanatory variables are HRT
+and anamnestic variables only, intermediate variables
+are $w_{lora}, \, w_{cs}$ and $w_{clv}$. The response is the
+diagnosis of glaucoma which is determined by a fixed classifying
+function and therefore not included in the learning
+sample \texttt{GlaucomaMVF}. We assign the given variables to explanatory
+and intermediate by specifying the input formula.
+<<indirect.formula, echo=TRUE>>=
+data("GlaucomaMVF", package="ipred")
+GlaucomaMVF <- GlaucomaMVF[,-63]
+formula.indirect <- Class~clv + lora + cs ~ .
+@
+The variables on the left-hand side represent the intermediate variables,
+modeled by the explanatory variables on the right-hand side. Almost each
+modeling technique can be used to predict the intermediate variables. We
+chose a linear model by \texttt{pFUN = list(list(model = lm))}.
+<<indirect.fit, echo=TRUE>>=
+classify <- function (data) {
+ attach(data)
+ res <- ifelse((!is.na(clv) & !is.na(lora) & clv >= 5.1 & lora >=
+ 49.23372) | (!is.na(clv) & !is.na(lora) & !is.na(cs) &
+ clv < 5.1 & lora >= 58.55409 & cs < 1.405) | (is.na(clv) &
+ !is.na(lora) & !is.na(cs) & lora >= 58.55409 & cs < 1.405) |
+ (!is.na(clv) & is.na(lora) & cs < 1.405), 0, 1)
+ detach(data)
+ factor (res, labels = c("glaucoma", "normal"))
+}
+fit <- inclass(formula.indirect, pFUN = list(list(model = lm)),
+ cFUN = classify, data = GlaucomaMVF)
+@
+\texttt{print} displays the subdivision of variables and the chosen
+modeling technique
+<<print.indirect, echo=TRUE>>=
+print(fit)
+@
+Furthermore, indirect classification predicts the intermediate
+variables based on the explanatory variables and classifies them
+according to a fixed classifying function in a second step, that means
+a deterministically known function for the class membership has to be
+specified. In our example this function is given in
+Figure \ref{diag} and implemented in the function \texttt{classify}.\\
+Prediction of future observations is now performed by
+<<predict.indirect, echo=TRUE>>=
+predict(object = fit, newdata = GlaucomaMVF[c(1:3, 86:88),])
+@
+We perform a bootstrap aggregated indirect classification approach by
+choosing \texttt{pFUN = bagging} and specifying the number of
+bootstrap samples \citep{ifcs:2001}. Regression or classification
+trees are fitted for each bootstrap sample, with respect to the
+measurement scale of the specified intermediate variables
+<<bagging.indirect, echo=TRUE>>=
+mypredict.rpart <- function(object, newdata) {
+ RES <- predict(object, newdata)
+ RET <- rep(NA, nrow(newdata))
+ NAMES <- rownames(newdata)
+ RET[NAMES %in% names(RES)] <- RES[NAMES[NAMES %in% names(RES)]]
+ RET
+}
+fit <- inbagg(formula.indirect, pFUN = list(list(model = rpart, predict =
+mypredict.rpart)), cFUN = classify, nbagg = 25, data = GlaucomaMVF)
+@
+The call for the prediction of values remains unchanged.
+
+
+\section{Error Rate Estimation}
+Classification rules are usually assessed by their misclassification rate.
+Hence, error rate estimation is of main importance.
+The function \texttt{errorest} implements a unified interface to several
+resampling based estimators. Referring to the example, we apply a linear
+discriminant analysis and specify the error rate estimator
+by \texttt{estimator = "cv", "boot"} or \texttt{"632plus"},
+respectively. A 10-fold cross validation is performed by
+choosing \texttt{estimator = "cv"} and
+\texttt{est.para = control.errorest(k = 10)}. The options \texttt{estimator = "boot"} or
+\texttt{estimator = "632plus"} deliver a bootstrap estimator
+and its bias corrected version {\sl .632+} \citep[see][]{efron:1997},
+we specify the number of bootstrap samples to be drawn by
+\texttt{est.para = control.errorest(nboot = 50)}.
+Further arguments are required to particularize the
+classification technique. The argument \texttt{predict} represents
+the chosen predictive function. For a unified interface
+\texttt{predict} has to be based on the arguments \texttt{object}
+and \texttt{newdata} only, therefore a wrapper function \texttt{mypredict} is necessary for classifiers
+which require more than those arguments or do not return the predicted
+classes by default. For a linear discriminant analysis with \texttt{lda}, we
+need to specify
+<<plda, echo=TRUE>>=
+mypredict.lda <- function(object, newdata){
+ predict(object, newdata = newdata)$class
+}
+@
+and calculate a 10-fold-cross-validated error rate estimator
+for a linear discriminant analysis by calling
+<<cvlda, echo=TRUE>>=
+errorest(Class ~ ., data= GlaucomaM,
+ model=lda, estimator = "cv", predict= mypredict.lda)
+@
+For the indirect approach the specification of the call becomes
+slightly more complicated.
+%Again for a unified interface a wrapper
+%function has to be used, which incorporates the fixed classification rule
+The bias corrected estimator {\sl .632+} is computed by
+<<cvindirect, echo=TRUE>>=
+errorest(formula.indirect,
+ data = GlaucomaMVF, model = inclass,
+ estimator = "632plus",
+ pFUN = list(list(model = lm)), cFUN = classify)
+@
+Because of the subdivision of variables and a formula describing the
+modeling between explanatory and intermediate variables only,
+we must call the class membership variable. Hence, in contrast to the
+function \texttt{inclass} the data set \texttt{GlaucomaMVF} used in
+\texttt{errorest} must contain explanatory, intermediate and response
+variables.
+
+Sometimes it may be necessary to reduce the number of predictors before
+training a classifier. Estimating the error rate after the variable
+selection leads to biased estimates of the misclassfication error and
+therefore one should estimate the error rate of the whole procedure. Within
+the \texttt{errorest} framework, this can be done as follows. First, we define
+a function which does both variable selection and training of the
+classifier. For illustration proposes, we select the predictors by comparing
+their univariate $P$-values of a two-sample $t$-test with a prespecified
+level and train a LDA using the selected variables only.
+
+<<varsel-def, echo=TRUE>>=
+mymod <- function(formula, data, level=0.05) {
+ # select all predictors that are associated with an
+ # univariate t.test p-value of less that level
+ sel <- which(lapply(data, function(x) {
+ if (!is.numeric(x))
+ return(1)
+ else
+ return(t.test(x ~ data$Class)$p.value)
+ }) < level)
+ # make sure that the response is still there
+ sel <- c(which(colnames(data) %in% "Class"), sel)
+ # compute a LDA using the selected predictors only
+ mod <- lda(formula , data=data[,sel])
+ # and return a function for prediction
+ function(newdata) {
+ predict(mod, newdata=newdata[,sel])$class
+ }
+}
+@
+
+Note that \texttt{mymod} does not return
+an object of class \texttt{lda} but a function
+with argument \texttt{newdata} only. Thanks to lexical scoping, this
+function is used for computing
+predicted classes instead of a function \texttt{predict} passed to
+\texttt{errorest} as argument. Computing a $5$-fold cross-validated error rate
+estimator now is approximately a one-liner.
+
+<<varsel-comp, echo=TRUE>>=
+errorest(Class ~ . , data=GlaucomaM, model=mymod, estimator = "cv",
+est.para=control.errorest(k=5))
+@
+
+
+%%To summarize the performance of the different classification techniques in the considered example of glaucoma diagnosis, the 10-fold
+%%cross-validated error estimator delivers the
+%%results given in Table \ref{tenf}.
+%%\begin{figure}
+%%\begin{center}
+%%\begin{tabular}{ rrr }
+%%\hline
+%%dataset & method & error estimate \\
+%%\hline
+%%\texttt{GlaucomaM} & {\sl slda} & 0.168 \\
+%%\texttt{GlaucomaM} & {\sl bagging} & 0.158 \\
+%%\texttt{GlaucomaM} & {\sl double-bagging} & 0.153 \\
+%%\texttt{GlaucomaMVF} & {\sl inclass-bagging} & 0.206 \\
+%%\tetxtt{GlaucomaMVF} & {\sl inclass-lm} & 0.229 \\
+%%\hline
+%%\end{tabular}
+%%\caption{10-fold cross-validated error estimation of
+%%the misclassification error for several classification
+%%methods: {\sl slda} - stabilised linear discriminant analysis,
+%%{\sl bagging} - bagging with 50 bootstrap samples,
+%%{\sl double-bagging} - bagging with 50 bootstrap samples,
+%%combined with sLDA, {\sl inclass-bagging} -
+%%indirect classification using bagging,
+%%{\sl inclass-lm} indirect classification using
+%%linear modeling. \label{tenf}}
+%%\end{center}
+%%\end{figure}
+%%Note that an estimator of the variance is available for the ordinary
+%%bootstrap estimator (\texttt{estimator="boot"}) only, see \cite{efron:1997}.
+
+
+\section{Summary}
+\ipred tries to implement a unified interface to some recent developments
+in classification and error rate estimation. It is by no means finished
+nor perfect and we very much appreciate comments, suggestions and criticism.
+Currently, the major drawback is speed. Calling \texttt{rpart} $50$
+times for each bootstrap sample is relatively inefficient
+but the design of interfaces was our main focus instead of optimization.
+Beside the examples shown, \texttt{bagging} can be used to compute bagging
+for regression trees and \texttt{errorest} computes estimators of the
+mean squared error for regression models.
+
+\bibliographystyle{plainnat}
+\bibliography{ipred}
+
+
+\end{document}
diff --git a/vignettes/ipred.bib b/vignettes/ipred.bib
new file mode 100644
index 0000000..4f7f07b
--- /dev/null
+++ b/vignettes/ipred.bib
@@ -0,0 +1,73 @@
+ at article{breiman:1996,
+ key = {53},
+ author = {L. Breiman},
+ title = {Bagging Predictors},
+ journal = {Machine Learning},
+ pages = {123-140},
+ year = {1996},
+ volume = {24},
+ number = {2}
+}
+
+ at article{efron:1997,
+ key = {52},
+ author = {B. Efron and R. Tibshirani},
+ title = {Improvements on Cross-Validation: The .632+ Bootstrap Method},
+ journal = {Journal of the American Statistical Association},
+ pages = {548-560},
+ year = {1997},
+ volume = {92},
+ number = {438}
+}
+
+ at article{hand:2001,
+ key = {32},
+ author = {D.J. Hand and H.G. Li and N.M. Adams},
+ title = {Supervised classification with structured class definitions},
+ journal = {Computational Statistics \& Data Analysis},
+ pages = {209-225},
+ year = {2001},
+ volume = {36}
+}
+
+ at inproceedings{ifcs:2001,
+ author = {A. Peters and T. Hothorn and B. Lausen},
+ title = {Glaucoma diagnosis by indirect classifiers},
+ booktitle = {Studies in Classification, Data Analysis, and Knowledge Organization (to appear)},
+ organization = {Proceedings of the 8th Conference of the International Federation of Classification Societies},
+ year = {2002}
+}
+
+ at techreport{out-of-bag:1996,
+ key = {T162},
+ author = {Leo Breiman},
+ title = {Out-Of-Bag Estimation},
+ institution = {Statistics Department, University of California Berkeley},
+ year = {1996},
+ address = {Berkeley CA 94708}
+}
+
+ at article{double-bag:2002,
+ key = {247},
+ author = {Torsten Hothorn and Berthold Lausen},
+ title = {Double-Bagging: Combining classifiers by bootstrap aggregation},
+ journal = {Pattern Recognition},
+ year = {2003},
+ pages = {1303-1309},
+ volume = {36},
+ number = {6}
+}
+
+ at article{Rnews:Peters+Hothorn+Lausen:2002,
+ key = {308},
+ author = {Andrea Peters and Torsten Hothorn and Berthold
+ Lausen},
+ title = {ipred: Improved Predictors},
+ journal = {R News},
+ year = 2002,
+ month = {June},
+ volume = 2,
+ number = 2,
+ pages = {33--36},
+ url = {http://CRAN.R-project.org/doc/Rnews/}
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/r-cran-ipred.git
More information about the debian-science-commits
mailing list