## R commands for STAT 485
## Intermediate Statistical Techniques for Machine Learning and Big Data
## Version: 2024.01.17
## Reference: http://homepages.math.uic.edu/~jyang06/stat485/stat485.html

## R package: ElemStatLearn, "ElemStatLearn_2015.6.26.2.tar.gz"
## Source: https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/

## Data Sets, Functions and Examples from the Book (ESL):
## "The Elements of Statistical Learning: Data Mining, Inference, and Prediction"
## by Trevor Hastie, Robert Tibshirani and Jerome Friedman, Second Edition, 2017 (corrected 12th printing)
## Source: https://hastie.su.domains/ElemStatLearn/download.html

## Reference: Section 3.4 in the ESL book

## install the package "ElemStatLearn" from the archived source tarball
# install.packages("ElemStatLearn_2015.6.26.2.tar.gz", repos = NULL, type = "source")

## install the package "glmnet"
# install.packages("glmnet")

## install the package "lars"
# install.packages("lars")

# load the package "ElemStatLearn"
library("ElemStatLearn")

# load the data set "prostate"
data(prostate)

# partition the original data into training and testing datasets
train <- subset( prostate, train==TRUE )[,1:9]
test  <- subset( prostate, train==FALSE)[,1:9]

# standardize the eight predictors using the mean and sd of the full data set
trainst <- train
for(i in 1:8) {
  trainst[,i] <- trainst[,i] - mean(prostate[,i])
  trainst[,i] <- trainst[,i]/sd(prostate[,i])
}
testst <- test
for(i in 1:8) {
  testst[,i] <- testst[,i] - mean(prostate[,i])
  testst[,i] <- testst[,i]/sd(prostate[,i])
}

## Section 3.4.1: Ridge Regression

# Ridge regression is implemented in several R packages, including:
#   MASS:          lm.ridge
#   mda:           gen.ridge
#   survival:      ridge
#   Design:        pentrace
#   mgcv:          pcls (very general)
#   ElemStatLearn: simple.ridge
#   glmnet:        glmnet

## use "simple.ridge" in the ElemStatLearn package
prostate.ridge <- simple.ridge( trainst[,1:8], trainst[,9], df=seq(1,8,by=0.5) )

# reproduce Figure 3.8 on page 65: coefficient profiles against df(lambda)
matplot( prostate.ridge$df, t(prostate.ridge$beta), lty=1, type="b", col="purple", pch=17,
         xlab=expression(paste("df(",lambda,")")), ylab="Coefficients" )
abline(0,0,lty=2)
abline(v=5,lty=2)
text(prostate.ridge$df[length(prostate.ridge$df)],
     prostate.ridge$beta[,length(prostate.ridge$df)],
     labels=colnames(train)[1:8])

# lambda and the corresponding df(lambda)
round(rbind(prostate.ridge$lambda,prostate.ridge$df),2)
#        [,1]  [,2]  [,3]   [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11] [,12] [,13] [,14] [,15]
# [1,] 406.35 236.5 154.3 106.91 76.72 56.21 41.62 30.88 22.78 16.53 11.63  7.73   4.6  2.07     0
# [2,]   1.00   1.5   2.0   2.50  3.00  3.50  4.00  4.50  5.00  5.50  6.00  6.50   7.0  7.50     8

# intercept
prostate.ridge$beta0
# [1] 2.452345

## use glmnet in package glmnet
library(glmnet)

# use 10-fold cross-validation to choose the best lambda,
# reproducing the Ridge Regression panel of Figure 3.7 on page 62
set.seed(331)
cv.out=cv.glmnet(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), nfolds=10, alpha=0, standardize=F)
plot(cv.out)
abline(h=cv.out$cvup[which.min(cv.out$cvm)])

# the best lambda chosen by 10-fold cross-validation (one-standard-error rule)
lambda.10fold=cv.out$lambda.1se
# [1] 1.033051

# apply ridge regression with the chosen lambda
fitridge=glmnet(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), alpha=0,
                lambda=lambda.10fold, standardize=F, thresh=1e-12)

# fitted coefficients; note that they are not the same as Table 3.3 on page 63
# due to the chosen lambda and the fitting algorithm
fitridge$a0
# 2.458899
fitridge$beta
# lcavol   0.308789518
# lweight  0.200966864
# age     -0.005016066
# lbph     0.124809839
# svi      0.183984701
# lcp      0.063662855
# gleason  0.050579761
# pgg45    0.107827360
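
## A minimal side check (sketch): assuming glmnet's documented Gaussian objective
## (1/(2n))*RSS + lambda*[(1-alpha)*||beta||^2/2 + alpha*||beta||_1], the ridge fit
## (alpha=0) has the closed form beta = (Xc'Xc + n*lambda*I)^{-1} Xc'yc, where Xc and yc
## are the column-centered predictors and response. The comparison below reuses trainst,
## lambda.10fold and fitridge from above and should agree closely with fitridge$beta,
## up to glmnet's convergence tolerance.
Xc <- scale(as.matrix(trainst[,1:8]), center=TRUE, scale=FALSE)   # center the predictors
yc <- trainst[,9] - mean(trainst[,9])                             # center the response
n  <- nrow(Xc)
beta.closed <- drop(solve(t(Xc) %*% Xc + n*lambda.10fold*diag(8), t(Xc) %*% yc))
cbind(closed.form=beta.closed, glmnet=as.numeric(fitridge$beta))  # side-by-side comparison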

# estimate mean prediction error on the 30 test observations
test.ridge=predict(fitridge,newx=as.matrix(testst[,1:8]))
# mean (absolute) prediction error
mean(abs(test[,9]-test.ridge))
# 0.5401661
# mean (squared) prediction error
mean((test[,9]-test.ridge)^2)
# 0.518376
# standard error of mean (squared) prediction error
sd((test[,9]-test.ridge)^2)/sqrt(30)
# 0.1818412

## Section 3.4.2: Lasso

## the package "lasso2" would be needed to reproduce Figure 3.10 on page 70, but it no longer works

## use glmnet in package glmnet
library(glmnet)

# use 10-fold cross-validation to choose the best lambda,
# reproducing the Lasso panel of Figure 3.7 on page 62
set.seed(321)
cv.out=cv.glmnet(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), nfolds=10, alpha=1, standardize=F)
plot(cv.out)
abline(h=cv.out$cvup[which.min(cv.out$cvm)])

# the best lambda chosen by 10-fold cross-validation (one-standard-error rule)
lambda.10fold=cv.out$lambda.1se
# [1] 0.207564

# apply the lasso with the chosen lambda
fitlasso=glmnet(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), alpha=1,
                lambda=lambda.10fold, standardize=F, thresh=1e-12)

# fitted coefficients; note that they are not the same as Table 3.3 on page 63
# due to the chosen lambda and the fitting algorithm
fitlasso$a0
# 2.468333
fitlasso$beta
# lcavol  0.538635531
# lweight 0.188683568
# age     .
# lbph    .
# svi     0.085822797
# lcp     .
# gleason .
# pgg45   0.006249739

# estimate mean prediction error on the 30 test observations
test.lasso=predict(fitlasso,newx=as.matrix(testst[,1:8]))
# mean (absolute) prediction error
mean(abs(test[,9]-test.lasso))
# 0.5101279
# mean (squared) prediction error
mean((test[,9]-test.lasso)^2)
# 0.4787806
# standard error of mean (squared) prediction error
sd((test[,9]-test.lasso)^2)/sqrt(30)
# 0.1644094

## Section 3.4.4: LAR (Least Angle Regression)

## reproduce Figure 3.10 on page 70, using lars in package lars
library(lars)
prostate.lar <- lars(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), type="lar", trace=TRUE, normalize=F)
# plot the LAR path (similar to Figure 3.10 on page 70)
plot(prostate.lar)

# choose the number of steps k using 10-fold cross-validation;
# "cv.lars" also generates a graph similar to Figure 3.7 on page 62
set.seed(32)   # initial random seed for the 10-fold CV
cv.out <- cv.lars(x=as.matrix(trainst[,1:8]), y=as.numeric(trainst[,9]), K=10, plot.it=T, type="lar", trace=TRUE, normalize=F)
itemp <- which.min(cv.out$cv)   # 8
abline(h=cv.out$cv[itemp]+cv.out$cv.error[itemp], lty=2)
# smallest k whose CV error is within one standard error of the minimum; the chosen k = 5
k.lars <- min(cv.out$index[cv.out$cv < cv.out$cv[itemp]+cv.out$cv.error[itemp]])
abline(v=k.lars, lty=2)

# estimate mean prediction error on the 30 test observations
test.lars=predict(prostate.lar, newx=as.matrix(testst[,1:8]), s=k.lars, type="fit", mode=cv.out$mode)$fit
# mean (absolute) prediction error
mean(abs(test[,9]-test.lars))
# 0.508568
# mean (squared) prediction error
mean((test[,9]-test.lars)^2)
# 0.4742773
# standard error of mean (squared) prediction error
sd((test[,9]-test.lars)^2)/sqrt(30)
# 0.1629723
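
## A small summary sketch: collect the test-set results computed above for ridge,
## the lasso and LAR into one table, in the spirit of the "Test Error" and "Std Error"
## rows of Table 3.3 on page 63. It assumes test.ridge, test.lasso and test.lars
## exist from the code above; exact values depend on the CV seeds used.
pred.err <- function(pred) {
  err <- (test[,9] - as.numeric(pred))^2
  c(MeanAbsErr = mean(abs(test[,9] - as.numeric(pred))),
    TestErr    = mean(err),
    StdErr     = sd(err)/sqrt(length(err)))
}
round(rbind(Ridge = pred.err(test.ridge),
            Lasso = pred.err(test.lasso),
            LAR   = pred.err(test.lars)), 4)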