## R commands for STAT 485
## Intermediate Statistical Techniques for Machine Learning and Big Data
## Version: 2024.02.04
## Reference: http://homepages.math.uic.edu/~jyang06/stat485/stat485.html
## R package: ElemStatLearn, "ElemStatLearn_2015.6.26.2.tar"
## Source: https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/
## Data Sets, Functions and Examples from the Book (ESL):
## "The Elements of Statistical Learning, Data Mining, Inference, and Prediction"
## by Trevor Hastie, Robert Tibshirani and Jerome Friedman,
## Second Edition, 2017 (corrected 12th printing)
## Source: https://hastie.su.domains/ElemStatLearn/download.html
## Reference: Chapter 7 in the ESL book

## load the package "ElemStatLearn"
library("ElemStatLearn")

## AIC & BIC
## Sections 7.5 & 7.7 in the ESL book

## Subset selection for the data set "prostate" ----
data(prostate)

# Standardize the 8 predictor columns of a train/test split using the mean
# and sd of the FULL prostate data (columns 1-8). Note this deliberately
# follows the ESL book's setup, which standardizes with statistics computed
# on all 97 observations (train + test).
standardize_predictors <- function(df, ref = prostate) {
  for (i in 1:8) {
    df[, i] <- (df[, i] - mean(ref[, i])) / sd(ref[, i])
  }
  df
}

# partition the original data into training and testing datasets
train <- subset(prostate, train == TRUE)[, 1:9]
test <- subset(prostate, train == FALSE)[, 1:9]

# standardization of predictors
trainst <- standardize_predictors(train)
testst <- standardize_predictors(test)

# full model: all 8 predictors
fitls <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,
            data = trainst)
AIC(fitls)
# 155.0101
BIC(fitls)
# 177.057

# reduced model: 4 predictors
fitlsr <- lm(lpsa ~ lcavol + lweight + lbph + svi, data = trainst)
AIC(fitlsr)
# 154.3127
BIC(fitlsr)
# 167.5408

## Best-subset search over all candidate models ----
# load package "leaps" for function "regsubsets"
library("leaps")
# nbest=70 with really.big=TRUE keeps (up to) 70 best models per subset size,
# i.e. effectively all subsets of the 8 predictors
prostate.leaps <- regsubsets(lpsa ~ ., data = trainst,
                             nbest = 70, really.big = TRUE)
prostate.leaps.sum <- summary(prostate.leaps)
names(prostate.leaps.sum)

# AIC: Mallows' Cp is equivalent to AIC for linear models (ESL Sec. 7.5)
plot(prostate.leaps.sum$cp, xlab="Model/Subset Index", ylab="Cp/AIC", type = "l")
# number of covariates in each candidate model
as.numeric(rownames(prostate.leaps.sum$which))
cpmin <- which.min(prostate.leaps.sum$cp)
points(cpmin, prostate.leaps.sum$cp[cpmin], col = "red", cex = 2, pch = 20)
round(coef(prostate.leaps, cpmin), 3)
# (Intercept)      lcavol     lweight         age        lbph         svi         lcp       pgg45
#       2.467       0.676       0.265      -0.145       0.210       0.307      -0.287       0.252

# BIC
bicmin <- which.min(prostate.leaps.sum$bic)
plot(prostate.leaps.sum$bic, xlab="Model/Subset Index", ylab="BIC", type = "l")
points(bicmin, prostate.leaps.sum$bic[bicmin], col = "red", cex = 2, pch = 20)
round(coef(prostate.leaps, bicmin), 3)
# (Intercept)      lcavol     lweight
#       2.477       0.740       0.316

## A comparison between AIC and BIC over random partitions ----
# generate 100 random partitions (67 training / 30 testing out of 97)
Nsimu <- 100
set.seed(532)  # specify the initial random seed for reproducibility
# each row indicates one set of simulated training indices
train.index <- matrix(0, Nsimu, 97)
for (i in 1:Nsimu) {
  # generate random indices of training data (1 = train, 0 = test)
  train.index[i, ] <- sample(x = c(rep(1, 67), rep(0, 30)), size = 97,
                             replace = FALSE)
}

# define matrices to store results
APerror <- matrix(0, Nsimu, 5)  # mean (absolute) prediction error
SPerror <- matrix(0, Nsimu, 5)  # mean (squared) prediction error
colnames(APerror) <- colnames(SPerror) <-
  c("FullModel", "ReducedModel(4)", "ReducedModel(2)", "AIC", "BIC")
Tuning <- APerror  # record values of tuning parameters (number of covariates)

ttemp <- proc.time()  # record computing time
for (isimu in 1:Nsimu) {  # start of loop with "isimu"
  # partition the original data into training and testing datasets
  train <- subset(prostate, train.index[isimu, ] == 1)[, 1:9]
  test <- subset(prostate, train.index[isimu, ] == 0)[, 1:9]

  # fit linear models on the standardized training dataset using LS
  trainst <- standardize_predictors(train)
  fitls <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,
              data = trainst)
  fitlsr4 <- lm(lpsa ~ lcavol + lweight + lbph + svi, data = trainst)
  fitlsr2 <- lm(lpsa ~ lcavol + lweight, data = trainst)

  # check testing errors on the standardized testing dataset
  testst <- standardize_predictors(test)

  ## (I) mean prediction error based on full model
  test.fitls <- predict(fitls, newdata = testst)
  APerror[isimu, 1] <- mean(abs(test[, 9] - test.fitls))   # mean absolute PE
  SPerror[isimu, 1] <- mean((test[, 9] - test.fitls)^2)    # mean squared PE

  ## (II) mean prediction error based on reduced model(4)
  test.fitlsr <- predict(fitlsr4, newdata = testst)
  APerror[isimu, 2] <- mean(abs(test[, 9] - test.fitlsr))
  SPerror[isimu, 2] <- mean((test[, 9] - test.fitlsr)^2)

  ## (III) mean prediction error based on reduced model(2)
  test.fitlsr <- predict(fitlsr2, newdata = testst)
  APerror[isimu, 3] <- mean(abs(test[, 9] - test.fitlsr))
  SPerror[isimu, 3] <- mean((test[, 9] - test.fitlsr)^2)

  ## (IV) mean prediction error based on AIC (minimum Cp) model
  prostate.leaps <- regsubsets(lpsa ~ ., data = trainst,
                               nbest = 70, really.big = TRUE)
  prostate.leaps.sum <- summary(prostate.leaps)
  cpmin <- which.min(prostate.leaps.sum$cp)
  ctemp <- coef(prostate.leaps, cpmin)
  Tuning[isimu, 4] <- length(ctemp) - 1  # number of selected covariates
  # refit by LS using only the selected covariates (drop the intercept name)
  fit.aic <- lm(lpsa ~ ., data = trainst[, c(names(ctemp)[-1], "lpsa")])
  test.aic <- predict(fit.aic, newdata = testst)
  APerror[isimu, 4] <- mean(abs(test[, 9] - test.aic))
  SPerror[isimu, 4] <- mean((test[, 9] - test.aic)^2)

  ## (V) mean prediction error based on BIC model
  bicmin <- which.min(prostate.leaps.sum$bic)
  ctemp <- coef(prostate.leaps, bicmin)
  Tuning[isimu, 5] <- length(ctemp) - 1
  fit.bic <- lm(lpsa ~ ., data = trainst[, c(names(ctemp)[-1], "lpsa")])
  test.bic <- predict(fit.bic, newdata = testst)
  APerror[isimu, 5] <- mean(abs(test[, 9] - test.bic))
  SPerror[isimu, 5] <- mean((test[, 9] - test.bic)^2)
}
proc.time() - ttemp
#    user  system elapsed
#    1.19    0.03    2.76

## Plot the output ----
par(mfrow = c(1, 1))
# mean (absolute) prediction error
boxplot(APerror)
# mean (squared) prediction error
boxplot(SPerror)

## Tuning summary: number of covariates selected by each method ----
Tuning[, 1] <- 8  # full model
Tuning[, 2] <- 4  # reduced model with 4 covariates
Tuning[, 3] <- 2  # reduced model with 2 covariates
summary(Tuning)
#    FullModel ReducedModel(4) ReducedModel(2)      AIC            BIC
#  Min.   :8   Min.   :4       Min.   :2       Min.   :3.00   Min.   :1.00
#  1st Qu.:8   1st Qu.:4       1st Qu.:2       1st Qu.:3.00   1st Qu.:3.00
#  Median :8   Median :4       Median :2       Median :4.00   Median :3.00
#  Mean   :8   Mean   :4       Mean   :2       Mean   :4.12   Mean   :2.92
#  3rd Qu.:8   3rd Qu.:4       3rd Qu.:2       3rd Qu.:5.00   3rd Qu.:3.00
#  Max.   :8   Max.   :4       Max.   :2       Max.   :7.00   Max.   :5.00