## R commands for STAT 485
## Intermediate Statistical Techniques for Machine Learning and Big Data
## Version: 2024.01.05
## Reference: http://homepages.math.uic.edu/~jyang06/stat485/stat485.html
## R package: ElemStatLearn, "ElemStatLearn_2015.6.26.2.tar"
## Source: https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/
## Data Sets, Functions and Examples from the Book (ESL):
## "The Elements of Statistical Learning, Data Mining, Inference, and Prediction"
## by Trevor Hastie, Robert Tibshirani and Jerome Friedman,
## Second Edition, 2017 (corrected 12th printing)
## Source: https://hastie.su.domains/ElemStatLearn/download.html

## Reference: Section 3.3 in the ESL book

## install the archived package "ElemStatLearn" from a local source tarball
## (file name below matches the archive referenced in the header)
# install.packages("ElemStatLearn_2015.6.26.2.tar", repos = NULL, type = "source")

# load the package "ElemStatLearn"
library("ElemStatLearn")

# Load the data set "prostate": column 10 ("train") is the train/test
# indicator; columns 1:9 hold the 8 predictors plus the response "lpsa".
train <- subset(prostate, train == TRUE)[, 1:9]
test <- subset(prostate, train == FALSE)[, 1:9]

# load the package "leaps" (you may need to install it first)
# (RStudio Menu) "Tools" -> "Install Packages..."
library("leaps")

# The book (page 56) uses only the training subset, so we do the same.
# nbest = 70 is large enough to keep EVERY subset of each size, i.e. an
# exhaustive enumeration rather than only the single best per size.
prostate.leaps <- regsubsets(lpsa ~ ., data = train, nbest = 70,  # all!
                             really.big = TRUE)
prostate.leaps.sum <- summary(prostate.leaps)
attributes(prostate.leaps.sum)

# Logical matrix: one row per candidate model, one column per term.
prostate.models <- prostate.leaps.sum$which
# The row names of the "which" matrix encode the subset size, so convert
# them to numeric to recover the number of predictors in each model.
prostate.models.size <- as.numeric(attr(prostate.models, "dimnames")[[1]])
hist(prostate.models.size)

# Residual sum of squares of every candidate model, then the best
# (minimum) RSS within each subset size.
prostate.models.rss <- prostate.leaps.sum$rss
prostate.models.best.rss <-
  tapply(prostate.models.rss, prostate.models.size, min)
prostate.models.best.rss

# Let us add results for the intercept-only model (subset size 0)
prostate.dummy <- lm(lpsa ~ 1, data = train)
prostate.models.best.rss <-
  c(sum(resid(prostate.dummy)^2), prostate.models.best.rss)

# Making a plot (Figure 3.5 on page 58): best RSS per subset size (red
# curve) with every candidate model overlaid (brown triangles).
plot(0:8, prostate.models.best.rss, ylim = c(0, 100), type = "b",
     xlab = "subset size", ylab = "Residual Sum Square", col = "red2")
points(prostate.models.size, prostate.models.rss,
       pch = 17, col = "brown", cex = 0.7)

## check the best subset of size 4
model4 <- cbind(prostate.models[prostate.models.size == 4, ],
                prostate.models.rss[prostate.models.size == 4])
model4
#   (Intercept) lcavol lweight age lbph svi lcp gleason pgg45
# 4           1      1       1   0    1   1   0       0     0 32.81499
# 4           1      1       1   0    0   1   0       0     1 34.07415
# 4           1      1       1   0    1   0   0       0     1 34.25679
# 4           1      1       1   0    0   1   1       0     0 34.27369
## Conclusion: The best subset of size 4 contains {lcavol, lweight, lbph, svi}.

## check the best subset of size 2
model2 <- cbind(prostate.models[prostate.models.size == 2, ],
                prostate.models.rss[prostate.models.size == 2])
model2
#   (Intercept) lcavol lweight age lbph svi lcp gleason pgg45
# 2           1      1       1   0    0   0   0       0     0 37.09185
# 2           1      1       0   0    1   0   0       0     0 39.99230
# 2           1      1       0   0    0   1   0       0     0 42.31258
# 2           1      1       0   0    0   0   0       0     1 43.42310
## Conclusion: The best subset of size 2 contains {lcavol, lweight}.
## alternative solution: use "step" with AIC criterion

# Standardize the 8 predictors of the training set using the mean and
# standard deviation computed from the FULL prostate data set (as done in
# the ESL book); the response "lpsa" (column 9) is left untouched.
trainst <- train
for (i in 1:8) {
  trainst[, i] <- (trainst[, i] - mean(prostate[, i])) / sd(prostate[, i])
}

# fit linear model on the standardized training data using the LS method
fitls <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,
            data = trainst)

# Stepwise selection minimizing AIC, starting from the full model
# (step() defaults to direction = "both").
model.step <- step(fitls)
coef.step <- model.step$coefficients
k <- length(coef.step)

# Refit the selected model explicitly: element 1 of coef.step is the
# intercept, so names(coef.step[2:k]) are the selected predictors; keep
# those columns plus the response.
fitlsr.step <- lm(lpsa ~ ., data = trainst[, c(names(coef.step[2:k]), "lpsa")])

# Conclusion: The best model based on AIC is
# lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45