## R commands for STAT 485
## Intermediate Statistical Techniques for Machine Learning and Big Data
## Version: 2024.01.05
## Reference: http://homepages.math.uic.edu/~jyang06/stat485/stat485.html


## R package: ElemStatLearn, "ElemStatLearn_2015.6.26.2.tar"
## Source: https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/

## Data Sets, Functions and Examples from the Book (ESL): 
## "The Elements of Statistical Learning, Data Mining, Inference, and Prediction" 
## by Trevor Hastie, Robert Tibshirani and Jerome Friedman, Second Edition, 2017 (corrected 12th printing)
## Source: https://hastie.su.domains/ElemStatLearn/download.html

## Reference: Section 3.2.1 in the ESL book

## Set Working Directory To Source File Location
## (RStudio Menu) "Session" -> "Set Working Directory" -> "To Source File Location"
setwd("~/teaching/stat485/R")

## install the package  "ElemStatLearn"
install.packages("ElemStatLearn_2015.6.26.2.tar.gz", repos = NULL, type = "source")

# load the package  "ElemStatLearn" 
library("ElemStatLearn")

# load the data set "prostate"
data(prostate)
str( prostate )
# 'data.frame':	97 obs. of  10 variables:
# $ lcavol : num  -0.58 -0.994 -0.511 -1.204 0.751 ...
# $ lweight: num  2.77 3.32 2.69 3.28 3.43 ...
# $ age    : int  50 58 74 58 62 50 64 58 47 63 ...
# $ lbph   : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
# $ svi    : int  0 0 0 0 0 0 0 0 0 0 ...
# $ lcp    : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
# $ gleason: int  6 6 7 6 6 6 6 6 6 6 ...
# $ pgg45  : int  0 0 20 0 0 0 0 0 0 0 ...
# $ lpsa   : num  -0.431 -0.163 -0.163 -0.163 0.372 ...
# $ train  : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
table(prostate$train)
# FALSE  TRUE 
# 30    67

# partition the original data into training and testing datasets
train <- subset( prostate, train==TRUE )[,1:9]
test  <- subset( prostate, train==FALSE)[,1:9]  
# Note: There is a typo in "prostate" help file in "ElemStatLearn" package
# In "test  <- subset( prostate, train=FALSE )[,1:9]", "=" should be "=="

# check correlations
cor( prostate[,1:8] )
# reproduce Table 3.1 on page 50
round(cor( train[,1:8] ),3)

# scatter plot, reproducing Figure 1.1 on page 3
pairs( prostate[,1:9], col="violet" )

# standardize the training data
trainst <- train   
for(i in 1:8) {
  trainst[,i] <- trainst[,i] - mean(prostate[,i]);
  trainst[,i] <- trainst[,i]/sd(prostate[,i]);
}

# fit linear model on training dataset using LS method
fitls <- lm( lpsa ~ lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45, data=trainst )

# reproduce Table 3.2 on page 50, as well as some numbers in Table 3.3 on page 63
summary(fitls)
#               Estimate Std. Error t value Pr(>|t|)    
#   (Intercept)  2.46493    0.08931  27.598  < 2e-16 ***
#   lcavol       0.67953    0.12663   5.366 1.47e-06 ***
#   lweight      0.26305    0.09563   2.751  0.00792 ** 
#   age         -0.14146    0.10134  -1.396  0.16806    
#   lbph         0.21015    0.10222   2.056  0.04431 *  
#   svi          0.30520    0.12360   2.469  0.01651 *  
#   lcp         -0.28849    0.15453  -1.867  0.06697 .  
#   gleason     -0.02131    0.14525  -0.147  0.88389    
#   pgg45        0.26696    0.15361   1.738  0.08755 .  
# ---
#   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 0.7123 on 58 degrees of freedom
# Multiple R-squared:  0.6944,	Adjusted R-squared:  0.6522 
# F-statistic: 16.47 on 8 and 58 DF,  p-value: 2.042e-12

## Conclusion: According their p-values "Pr(>|t|)", 
##             age, lcp, gleason, pgg45 are not significant at 5% significance level.

## Question: Could we remove all the four predictors simultaneously?
## Answer: Use the partial F-test
## [1] Fit the full model and get RSSfull=0.7123^2*58=29.43
## [2] Fit the reduced model:
fitlsr <- lm( lpsa ~ lcavol+lweight+lbph+svi, data=trainst )
summary(fitlsr)
#               Estimate Std. Error t value Pr(>|t|)    
#   (Intercept)  2.47142    0.08901  27.766  < 2e-16 ***
#   lcavol       0.59582    0.10910   5.461 8.85e-07 ***
#   lweight      0.23084    0.09456   2.441   0.0175 *  
#   lbph         0.20313    0.10215   1.988   0.0512 .  
#   svi          0.27814    0.11311   2.459   0.0167 *  
#   ---
#   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 0.7275 on 62 degrees of freedom
# Multiple R-squared:  0.6592,	Adjusted R-squared:  0.6372 
# F-statistic: 29.98 on 4 and 62 DF,  p-value: 6.911e-14
##  then RSSreduced=0.7275^2*62=32.81
## [3] Test statistic: F=((32.81-29.43)/(62-58))/(29.43/58)=1.67
## [4] p-value:
1-pf(1.67, df1=4, df2=58)
## 0.1692794

## check testing errors
# standardize testing data at the same scale of training data
testst <- test
for(i in 1:8) {
  testst[,i] <- testst[,i] - mean(prostate[,i]);
  testst[,i] <- testst[,i]/sd(prostate[,i]);
}

# mean prediction error on testing data using mean training value
mean(trainst[,9])                             # 2.452345
# mean (absolute) prediction error
mean(abs(testst[,9]-mean(trainst[,9])))       # 0.7409334
# mean (squared) prediction error
mean((testst[,9]-mean(trainst[,9]))^2)        # 1.056733
# standard error of mean (squared) prediction error
sd((testst[,9]-mean(trainst[,9]))^2)/sqrt(30) # 0.396115

# mean prediction error based on full model, reproducing some numbers in Table 3.3 on page 63
test.fitls=predict(fitls, newdata=testst)  
# mean (absolute) prediction error
mean(abs(test[,9]-test.fitls))                # 0.5233719
# mean (squared) prediction error
mean((test[,9]-test.fitls)^2)                 # 0.521274
# standard error of mean (squared) prediction error
sd((test[,9]-test.fitls)^2)/sqrt(30)          #  0.178724


# mean prediction error based on reduced model
test.fitlsr=predict(fitlsr, newdata=testst)  
# mean (absolute) prediction error
mean(abs(test[,9]-test.fitlsr))                # 0.5139785
# mean (squared) prediction error
mean((test[,9]-test.fitlsr)^2)                 # 0.4563321
# standard error of mean (squared) prediction error
sd((test[,9]-test.fitlsr)^2)/sqrt(30)          # 0.1242902