## R commands for STAT 485
## Intermediate Statistical Techniques for Machine Learning and Big Data
## Version: 2024.01.05
## Reference: http://homepages.math.uic.edu/~jyang06/stat485/stat485.html
## R package: ElemStatLearn, "ElemStatLearn_2015.6.26.2.tar"
## Source: https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/
## Data Sets, Functions and Examples from the Book (ESL):
## "The Elements of Statistical Learning, Data Mining, Inference, and Prediction"
## by Trevor Hastie, Robert Tibshirani and Jerome Friedman,
## Second Edition, 2017 (corrected 12th printing)
## Source: https://hastie.su.domains/ElemStatLearn/download.html

## Reference: Section 3.3 in the ESL book

## install the archived package "ElemStatLearn" from a local source tarball
## (file name below matches the archive referenced in the header)
# install.packages("ElemStatLearn_2015.6.26.2.tar", repos = NULL, type = "source")

# load the package "ElemStatLearn"
library("ElemStatLearn")

# Load the data set "prostate": column 10 ("train") is the train/test
# indicator; columns 1:9 hold the 8 predictors plus the response "lpsa".
train <- subset(prostate, train == TRUE)[, 1:9]
test <- subset(prostate, train == FALSE)[, 1:9]

# load the package "leaps" (you may need to install it first)
# (RStudio Menu) "Tools" -> "Install Packages..."
library("leaps")

# The book (page 56) uses only the training subset, so we do the same.
# nbest = 70 is large enough to keep EVERY subset of each size, i.e. an
# exhaustive enumeration rather than only the single best per size.
prostate.leaps <- regsubsets(lpsa ~ ., data = train, nbest = 70,  # all!
                             really.big = TRUE)
prostate.leaps.sum <- summary(prostate.leaps)
attributes(prostate.leaps.sum)

# Logical matrix: one row per candidate model, one column per term.
prostate.models <- prostate.leaps.sum$which
# The row names of the "which" matrix encode the subset size, so convert
# them to numeric to recover the number of predictors in each model.
prostate.models.size <- as.numeric(attr(prostate.models, "dimnames")[[1]])
hist(prostate.models.size)

# Residual sum of squares of every candidate model, then the best
# (minimum) RSS within each subset size.
prostate.models.rss <- prostate.leaps.sum$rss
prostate.models.best.rss <-
  tapply(prostate.models.rss, prostate.models.size, min)
prostate.models.best.rss

# Let us add results for the intercept-only model (subset size 0)
prostate.dummy <- lm(lpsa ~ 1, data = train)
prostate.models.best.rss <-
  c(sum(resid(prostate.dummy)^2), prostate.models.best.rss)

# Making a plot (Figure 3.5 on page 58): best RSS per subset size (red
# curve) with every candidate model overlaid (brown triangles).
plot(0:8, prostate.models.best.rss, ylim = c(0, 100), type = "b",
     xlab = "subset size", ylab = "Residual Sum Square", col = "red2")
points(prostate.models.size, prostate.models.rss,
       pch = 17, col = "brown", cex = 0.7)

## check the best subset of size 4
model4 <- cbind(prostate.models[prostate.models.size == 4, ],
                prostate.models.rss[prostate.models.size == 4])
model4
#   (Intercept) lcavol lweight age lbph svi lcp gleason pgg45
# 4           1      1       1   0    1   1   0       0     0 32.81499
# 4           1      1       1   0    0   1   0       0     1 34.07415
# 4           1      1       1   0    1   0   0       0     1 34.25679
# 4           1      1       1   0    0   1   1       0     0 34.27369
## Conclusion: The best subset of size 4 contains {lcavol, lweight, lbph, svi}.

## check the best subset of size 2
model2 <- cbind(prostate.models[prostate.models.size == 2, ],
                prostate.models.rss[prostate.models.size == 2])
model2
#   (Intercept) lcavol lweight age lbph svi lcp gleason pgg45
# 2           1      1       1   0    0   0   0       0     0 37.09185
# 2           1      1       0   0    1   0   0       0     0 39.99230
# 2           1      1       0   0    0   1   0       0     0 42.31258
# 2           1      1       0   0    0   0   0       0     1 43.42310
## Conclusion: The best subset of size 2 contains {lcavol, lweight}.
## alternative solution: use "step" with AIC criterion

# Standardize the 8 predictors of the training set using the mean and
# standard deviation computed from the FULL prostate data set (as done in
# the ESL book); the response "lpsa" (column 9) is left untouched.
trainst <- train
for (i in 1:8) {
  trainst[, i] <- (trainst[, i] - mean(prostate[, i])) / sd(prostate[, i])
}

# fit linear model on the standardized training data using the LS method
fitls <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,
            data = trainst)

# Stepwise selection minimizing AIC, starting from the full model
# (step() defaults to direction = "both").
model.step <- step(fitls)
coef.step <- model.step$coefficients
k <- length(coef.step)

# Refit the selected model explicitly: element 1 of coef.step is the
# intercept, so names(coef.step[2:k]) are the selected predictors; keep
# those columns plus the response.
fitlsr.step <- lm(lpsa ~ ., data = trainst[, c(names(coef.step[2:k]), "lpsa")])

# Conclusion: The best model based on AIC is
# lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45