## R commands for STAT 485
## Intermediate Statistical Techniques for Machine Learning and Big Data
## Version: 2024.01.17
## Reference: http://homepages.math.uic.edu/~jyang06/stat485/stat485.html

## R package: ElemStatLearn, "ElemStatLearn_2015.6.26.2.tar.gz"
## Source: https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/

## Data Sets, Functions and Examples from the Book (ESL):
## "The Elements of Statistical Learning: Data Mining, Inference, and Prediction"
## by Trevor Hastie, Robert Tibshirani and Jerome Friedman, Second Edition, 2017 (corrected 12th printing)
## Source: https://hastie.su.domains/ElemStatLearn/download.html

## Reference: Section 3.4 in the ESL book

## install the package "ElemStatLearn" from the archived source tarball
# install.packages("ElemStatLearn_2015.6.26.2.tar.gz", repos = NULL, type = "source")

## install the package "glmnet"
# install.packages("glmnet")

## install the package "lars"
# install.packages("lars")

# load the package "ElemStatLearn"
library("ElemStatLearn")

# load the data set "prostate"
data(prostate)

# partition the original data into training and testing datasets
train <- subset( prostate, train==TRUE )[,1:9]
test  <- subset( prostate, train==FALSE)[,1:9]

# standardize the eight predictors using the mean and sd of the full data set
trainst <- train
for(i in 1:8) {
  trainst[,i] <- trainst[,i] - mean(prostate[,i])
  trainst[,i] <- trainst[,i]/sd(prostate[,i])
}
testst <- test
for(i in 1:8) {
  testst[,i] <- testst[,i] - mean(prostate[,i])
  testst[,i] <- testst[,i]/sd(prostate[,i])
}

## Section 3.4.1: Ridge Regression

# Ridge regression is implemented in several R packages, including:
#   MASS:          lm.ridge
#   mda:           gen.ridge
#   survival:      ridge
#   Design:        pentrace
#   mgcv:          pcls (very general)
#   ElemStatLearn: simple.ridge
#   glmnet:        glmnet

## use "simple.ridge" in the ElemStatLearn package
prostate.ridge <- simple.ridge( trainst[,1:8], trainst[,9], df=seq(1,8,by=0.5) )

# reproduce Figure 3.8 on page 65: coefficient profiles against df(lambda)
matplot( prostate.ridge$df, t(prostate.ridge$beta), lty=1, type="b", col="purple", pch=17,
         xlab=expression(paste("df(",lambda,")")), ylab="Coefficients" )
abline(0,0,lty=2)
abline(v=5,lty=2)
text(prostate.ridge$df[length(prostate.ridge$df)],
     prostate.ridge$beta[,length(prostate.ridge$df)],
     labels=colnames(train)[1:8])

# lambda and the corresponding df(lambda)
round(rbind(prostate.ridge$lambda,prostate.ridge$df),2)
#        [,1]  [,2]  [,3]   [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11] [,12] [,13] [,14] [,15]
# [1,] 406.35 236.5 154.3 106.91 76.72 56.21 41.62 30.88 22.78 16.53 11.63  7.73   4.6  2.07     0
# [2,]   1.00   1.5   2.0   2.50  3.00  3.50  4.00  4.50  5.00  5.50  6.00  6.50   7.0  7.50     8

# intercept
prostate.ridge$beta0
# [1] 2.452345

## use glmnet in package glmnet
library(glmnet)

# use 10-fold cross-validation to choose the best lambda,
# reproducing the Ridge Regression panel of Figure 3.7 on page 62
set.seed(331)
cv.out=cv.glmnet(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), nfolds=10, alpha=0, standardize=F)
plot(cv.out)
abline(h=cv.out$cvup[which.min(cv.out$cvm)])

# the best lambda chosen by 10-fold cross-validation (one-standard-error rule)
lambda.10fold=cv.out$lambda.1se
# [1] 1.033051

# apply ridge regression with the chosen lambda
fitridge=glmnet(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), alpha=0,
                lambda=lambda.10fold, standardize=F, thresh=1e-12)

# fitted coefficients; note that they are not the same as Table 3.3 on page 63
# due to the chosen lambda and the fitting algorithm
fitridge$a0
# 2.458899
fitridge$beta
# lcavol   0.308789518
# lweight  0.200966864
# age     -0.005016066
# lbph     0.124809839
# svi      0.183984701
# lcp      0.063662855
# gleason  0.050579761
# pgg45    0.107827360
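
## A minimal side check (sketch): assuming glmnet's documented Gaussian objective
## (1/(2n))*RSS + lambda*[(1-alpha)*||beta||^2/2 + alpha*||beta||_1], the ridge fit
## (alpha=0) has the closed form beta = (Xc'Xc + n*lambda*I)^{-1} Xc'yc, where Xc and yc
## are the column-centered predictors and response. The comparison below reuses trainst,
## lambda.10fold and fitridge from above and should agree closely with fitridge$beta,
## up to glmnet's convergence tolerance.
Xc <- scale(as.matrix(trainst[,1:8]), center=TRUE, scale=FALSE)   # center the predictors
yc <- trainst[,9] - mean(trainst[,9])                             # center the response
n  <- nrow(Xc)
beta.closed <- drop(solve(t(Xc) %*% Xc + n*lambda.10fold*diag(8), t(Xc) %*% yc))
cbind(closed.form=beta.closed, glmnet=as.numeric(fitridge$beta))  # side-by-side comparison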

# estimate mean prediction error on the 30 test observations
test.ridge=predict(fitridge,newx=as.matrix(testst[,1:8]))
# mean (absolute) prediction error
mean(abs(test[,9]-test.ridge))
# 0.5401661
# mean (squared) prediction error
mean((test[,9]-test.ridge)^2)
# 0.518376
# standard error of mean (squared) prediction error
sd((test[,9]-test.ridge)^2)/sqrt(30)
# 0.1818412

## Section 3.4.2: Lasso

## the package "lasso2" would be needed to reproduce Figure 3.10 on page 70, but it no longer works

## use glmnet in package glmnet
library(glmnet)

# use 10-fold cross-validation to choose the best lambda,
# reproducing the Lasso panel of Figure 3.7 on page 62
set.seed(321)
cv.out=cv.glmnet(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), nfolds=10, alpha=1, standardize=F)
plot(cv.out)
abline(h=cv.out$cvup[which.min(cv.out$cvm)])

# the best lambda chosen by 10-fold cross-validation (one-standard-error rule)
lambda.10fold=cv.out$lambda.1se
# [1] 0.207564

# apply the lasso with the chosen lambda
fitlasso=glmnet(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), alpha=1,
                lambda=lambda.10fold, standardize=F, thresh=1e-12)

# fitted coefficients; note that they are not the same as Table 3.3 on page 63
# due to the chosen lambda and the fitting algorithm
fitlasso$a0
# 2.468333
fitlasso$beta
# lcavol  0.538635531
# lweight 0.188683568
# age     .
# lbph    .
# svi     0.085822797
# lcp     .
# gleason .
# pgg45   0.006249739

# estimate mean prediction error on the 30 test observations
test.lasso=predict(fitlasso,newx=as.matrix(testst[,1:8]))
# mean (absolute) prediction error
mean(abs(test[,9]-test.lasso))
# 0.5101279
# mean (squared) prediction error
mean((test[,9]-test.lasso)^2)
# 0.4787806
# standard error of mean (squared) prediction error
sd((test[,9]-test.lasso)^2)/sqrt(30)
# 0.1644094

## Section 3.4.4: LAR (Least Angle Regression)

## reproduce Figure 3.10 on page 70, using lars in package lars
library(lars)
prostate.lar <- lars(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), type="lar", trace=TRUE, normalize=F)
# plot the LAR path (similar to Figure 3.10 on page 70)
plot(prostate.lar)

# choose the number of steps k using 10-fold cross-validation;
# "cv.lars" also generates a graph similar to Figure 3.7 on page 62
set.seed(32)   # initial random seed for the 10-fold CV
cv.out <- cv.lars(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), K=10, plot.it=T, type="lar", trace=TRUE, normalize=F)
itemp <- which.min(cv.out$cv)   # 8
abline(h=cv.out$cv[itemp]+cv.out$cv.error[itemp], lty=2)
# smallest k whose CV error is within one standard error of the minimum; the chosen k = 5
k.lars <- min(cv.out$index[cv.out$cv < cv.out$cv[itemp]+cv.out$cv.error[itemp]])
abline(v=k.lars, lty=2)

# estimate mean prediction error on the 30 test observations
test.lars=predict(prostate.lar, newx=as.matrix(testst[,1:8]), s=k.lars, type="fit", mode=cv.out$mode)$fit
# mean (absolute) prediction error
mean(abs(test[,9]-test.lars))
# 0.508568
# mean (squared) prediction error
mean((test[,9]-test.lars)^2)
# 0.4742773
# standard error of mean (squared) prediction error
sd((test[,9]-test.lars)^2)/sqrt(30)
# 0.1629723
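
## A small summary sketch: collect the test-set results computed above for ridge,
## the lasso and LAR into one table, in the spirit of the "Test Error" and "Std Error"
## rows of Table 3.3 on page 63. It assumes test.ridge, test.lasso and test.lars
## exist from the code above; exact values depend on the CV seeds used.
pred.err <- function(pred) {
  err <- (test[,9] - as.numeric(pred))^2
  c(MeanAbsErr = mean(abs(test[,9] - as.numeric(pred))),
    TestErr    = mean(err),
    StdErr     = sd(err)/sqrt(length(err)))
}
round(rbind(Ridge = pred.err(test.ridge),
            Lasso = pred.err(test.lasso),
            LAR   = pred.err(test.lars)), 4)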