## R commands for STAT 485
## Intermediate Statistical Techniques for Machine Learning and Big Data
## Version: 2024.02.04
## Reference: http://homepages.math.uic.edu/~jyang06/stat485/stat485.html
## R package: ElemStatLearn, "ElemStatLearn_2015.6.26.2.tar"
## Source: https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/
## Data Sets, Functions and Examples from the Book (ESL):
## "The Elements of Statistical Learning, Data Mining, Inference, and Prediction"
## by Trevor Hastie, Robert Tibshirani and Jerome Friedman,
## Second Edition, 2017 (corrected 12th printing)
## Source: https://hastie.su.domains/ElemStatLearn/download.html
## Reference: Chapter 7 in the ESL book

## load the package "ElemStatLearn"
library("ElemStatLearn")

## AIC & BIC
## Sections 7.5 & 7.7 in the ESL book

## Subset selection for the data set "prostate" ----
data(prostate)

# Standardize the 8 predictor columns of a train/test split using the mean
# and sd of the FULL prostate data (columns 1-8). Note this deliberately
# follows the ESL book's setup, which standardizes with statistics computed
# on all 97 observations (train + test).
standardize_predictors <- function(df, ref = prostate) {
  for (i in 1:8) {
    df[, i] <- (df[, i] - mean(ref[, i])) / sd(ref[, i])
  }
  df
}

# partition the original data into training and testing datasets
train <- subset(prostate, train == TRUE)[, 1:9]
test <- subset(prostate, train == FALSE)[, 1:9]

# standardization of predictors
trainst <- standardize_predictors(train)
testst <- standardize_predictors(test)

# full model: all 8 predictors
fitls <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,
            data = trainst)
AIC(fitls)
# 155.0101
BIC(fitls)
# 177.057

# reduced model: 4 predictors
fitlsr <- lm(lpsa ~ lcavol + lweight + lbph + svi, data = trainst)
AIC(fitlsr)
# 154.3127
BIC(fitlsr)
# 167.5408

## Best-subset search over all candidate models ----
# load package "leaps" for function "regsubsets"
library("leaps")
# nbest=70 with really.big=TRUE keeps (up to) 70 best models per subset size,
# i.e. effectively all subsets of the 8 predictors
prostate.leaps <- regsubsets(lpsa ~ ., data = trainst,
                             nbest = 70, really.big = TRUE)
prostate.leaps.sum <- summary(prostate.leaps)
names(prostate.leaps.sum)

# AIC: Mallows' Cp is equivalent to AIC for linear models (ESL Sec. 7.5)
plot(prostate.leaps.sum$cp, xlab="Model/Subset Index", ylab="Cp/AIC", type = "l")
# number of covariates in each candidate model
as.numeric(rownames(prostate.leaps.sum$which))
cpmin <- which.min(prostate.leaps.sum$cp)
points(cpmin, prostate.leaps.sum$cp[cpmin], col = "red", cex = 2, pch = 20)
round(coef(prostate.leaps, cpmin), 3)
# (Intercept)      lcavol     lweight         age        lbph         svi         lcp       pgg45
#       2.467       0.676       0.265      -0.145       0.210       0.307      -0.287       0.252

# BIC
bicmin <- which.min(prostate.leaps.sum$bic)
plot(prostate.leaps.sum$bic, xlab="Model/Subset Index", ylab="BIC", type = "l")
points(bicmin, prostate.leaps.sum$bic[bicmin], col = "red", cex = 2, pch = 20)
round(coef(prostate.leaps, bicmin), 3)
# (Intercept)      lcavol     lweight
#       2.477       0.740       0.316

## A comparison between AIC and BIC over random partitions ----
# generate 100 random partitions (67 training / 30 testing out of 97)
Nsimu <- 100
set.seed(532)  # specify the initial random seed for reproducibility
# each row indicates one set of simulated training indices
train.index <- matrix(0, Nsimu, 97)
for (i in 1:Nsimu) {
  # generate random indices of training data (1 = train, 0 = test)
  train.index[i, ] <- sample(x = c(rep(1, 67), rep(0, 30)), size = 97,
                             replace = FALSE)
}

# define matrices to store results
APerror <- matrix(0, Nsimu, 5)  # mean (absolute) prediction error
SPerror <- matrix(0, Nsimu, 5)  # mean (squared) prediction error
colnames(APerror) <- colnames(SPerror) <-
  c("FullModel", "ReducedModel(4)", "ReducedModel(2)", "AIC", "BIC")
Tuning <- APerror  # record values of tuning parameters (number of covariates)

ttemp <- proc.time()  # record computing time
for (isimu in 1:Nsimu) {  # start of loop with "isimu"
  # partition the original data into training and testing datasets
  train <- subset(prostate, train.index[isimu, ] == 1)[, 1:9]
  test <- subset(prostate, train.index[isimu, ] == 0)[, 1:9]

  # fit linear models on the standardized training dataset using LS
  trainst <- standardize_predictors(train)
  fitls <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,
              data = trainst)
  fitlsr4 <- lm(lpsa ~ lcavol + lweight + lbph + svi, data = trainst)
  fitlsr2 <- lm(lpsa ~ lcavol + lweight, data = trainst)

  # check testing errors on the standardized testing dataset
  testst <- standardize_predictors(test)

  ## (I) mean prediction error based on full model
  test.fitls <- predict(fitls, newdata = testst)
  APerror[isimu, 1] <- mean(abs(test[, 9] - test.fitls))   # mean absolute PE
  SPerror[isimu, 1] <- mean((test[, 9] - test.fitls)^2)    # mean squared PE

  ## (II) mean prediction error based on reduced model(4)
  test.fitlsr <- predict(fitlsr4, newdata = testst)
  APerror[isimu, 2] <- mean(abs(test[, 9] - test.fitlsr))
  SPerror[isimu, 2] <- mean((test[, 9] - test.fitlsr)^2)

  ## (III) mean prediction error based on reduced model(2)
  test.fitlsr <- predict(fitlsr2, newdata = testst)
  APerror[isimu, 3] <- mean(abs(test[, 9] - test.fitlsr))
  SPerror[isimu, 3] <- mean((test[, 9] - test.fitlsr)^2)

  ## (IV) mean prediction error based on AIC (minimum Cp) model
  prostate.leaps <- regsubsets(lpsa ~ ., data = trainst,
                               nbest = 70, really.big = TRUE)
  prostate.leaps.sum <- summary(prostate.leaps)
  cpmin <- which.min(prostate.leaps.sum$cp)
  ctemp <- coef(prostate.leaps, cpmin)
  Tuning[isimu, 4] <- length(ctemp) - 1  # number of selected covariates
  # refit by LS using only the selected covariates (drop the intercept name)
  fit.aic <- lm(lpsa ~ ., data = trainst[, c(names(ctemp)[-1], "lpsa")])
  test.aic <- predict(fit.aic, newdata = testst)
  APerror[isimu, 4] <- mean(abs(test[, 9] - test.aic))
  SPerror[isimu, 4] <- mean((test[, 9] - test.aic)^2)

  ## (V) mean prediction error based on BIC model
  bicmin <- which.min(prostate.leaps.sum$bic)
  ctemp <- coef(prostate.leaps, bicmin)
  Tuning[isimu, 5] <- length(ctemp) - 1
  fit.bic <- lm(lpsa ~ ., data = trainst[, c(names(ctemp)[-1], "lpsa")])
  test.bic <- predict(fit.bic, newdata = testst)
  APerror[isimu, 5] <- mean(abs(test[, 9] - test.bic))
  SPerror[isimu, 5] <- mean((test[, 9] - test.bic)^2)
}
proc.time() - ttemp
#    user  system elapsed
#    1.19    0.03    2.76

## Plot the output ----
par(mfrow = c(1, 1))
# mean (absolute) prediction error
boxplot(APerror)
# mean (squared) prediction error
boxplot(SPerror)

## Tuning summary: number of covariates selected by each method ----
Tuning[, 1] <- 8  # full model
Tuning[, 2] <- 4  # reduced model with 4 covariates
Tuning[, 3] <- 2  # reduced model with 2 covariates
summary(Tuning)
#    FullModel ReducedModel(4) ReducedModel(2)      AIC            BIC
#  Min.   :8   Min.   :4       Min.   :2       Min.   :3.00   Min.   :1.00
#  1st Qu.:8   1st Qu.:4       1st Qu.:2       1st Qu.:3.00   1st Qu.:3.00
#  Median :8   Median :4       Median :2       Median :4.00   Median :3.00
#  Mean   :8   Mean   :4       Mean   :2       Mean   :4.12   Mean   :2.92
#  3rd Qu.:8   3rd Qu.:4       3rd Qu.:2       3rd Qu.:5.00   3rd Qu.:3.00
#  Max.   :8   Max.   :4       Max.   :2       Max.   :7.00   Max.   :5.00