## R commands for STAT 485
## Intermediate Statistical Techniques for Machine Learning and Big Data
## Version: 2024.01.26
## Reference: http://homepages.math.uic.edu/~jyang06/stat485/stat485.html

## R package: ElemStatLearn, "ElemStatLearn_2015.6.26.2.tar.gz"
## Source: https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/
## Data Sets, Functions and Examples from the Book (ESL):
## "The Elements of Statistical Learning, Data Mining, Inference, and Prediction"
## by Trevor Hastie, Robert Tibshirani and Jerome Friedman, Second Edition, 2017 (corrected 12th printing)
## Source: https://hastie.su.domains/ElemStatLearn/download.html

## Reference: Section 3.5 in the ESL book

## install the package "ElemStatLearn" from the archived source tarball
# install.packages("ElemStatLearn_2015.6.26.2.tar.gz", repos = NULL, type = "source")

## install the package "pls"
# install.packages("pls")


## Principal component analysis using R
## Reference: https://www.cs.princeton.edu/picasso/mats/PCA-Tutorial-Intuition_jp.pdf

## A toy example
N <- 300          # number of data points
set.seed(323)
z1 <- rnorm(N, mean=0, sd=3)
z2 <- rnorm(N, mean=0, sd=1)
x1 <- z1 - z2
x2 <- z1 + z2
plot(x1, x2)
# 1st principal component: (a1, a2) maximizing var(a1*x1 + a2*x2),
# subject to a1^2 + a2^2 = 1
abline(0, 1)      # that is, (a1, a2) = (1, 1)/sqrt(2)
# 2nd principal component: (b1, b2) maximizing var(b1*x1 + b2*x2),
# subject to b1^2 + b2^2 = 1 and cov(a1*x1 + a2*x2, b1*x1 + b2*x2) = 0
abline(0, -1)     # that is, (b1, b2) = (-1, 1)/sqrt(2)

# using R function "prcomp"
pca = prcomp(cbind(x1, x2))   # applying principal component analysis
plot(pca)
pca
# Standard deviations:
# [1] 4.413913 1.401510
# Rotation:
#          PC1        PC2
# x1 0.7018288 -0.7123457
# x2 0.7123457  0.7018288

## A real example
## Reference: https://www.r-bloggers.com/principal-component-analysis-using-r/
data("crimtab")   # load data: Student's 3000 Criminals Data
dim(crimtab)      # check dimensions; rows: finger lengths, columns: body heights
# [1] 42 22
sum(crimtab)      # 3000
plot(as.numeric(colnames(crimtab)), apply(crimtab, 2, var), type="b",
     xlab="height", ylab="variance")   # check the variance across the variables
pca = prcomp(crimtab)   # applying principal component analysis on crimtab data
# pca$sdev
plot(pca)   # the top two principal components may be chosen


## load the package "ElemStatLearn"
library("ElemStatLearn")

# load the data set "prostate"
data(prostate)

# partition the original data into training and testing datasets
train <- subset(prostate, train==TRUE)[, 1:9]
test  <- subset(prostate, train==FALSE)[, 1:9]
# Note: there is a typo in the "prostate" help file in the "ElemStatLearn" package:
# in "test <- subset( prostate, train=FALSE )[,1:9]", "=" should be "==".

# standardization of predictors: center and scale each predictor using the
# mean and standard deviation computed from the full data set
trainst <- train
for(i in 1:8) {
  trainst[,i] <- trainst[,i] - mean(prostate[,i])
  trainst[,i] <- trainst[,i]/sd(prostate[,i])
}
testst <- test
for(i in 1:8) {
  testst[,i] <- testst[,i] - mean(prostate[,i])
  testst[,i] <- testst[,i]/sd(prostate[,i])
}
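
## [Added sketch, not part of the original commands] Equivalently, the same
## standardization can be written with base R's scale(), passing the means and
## standard deviations computed from the full data set so that the training
## and testing predictors share one common scaling; shown here as a quick
## check against the loops above.
ctr <- colMeans(prostate[,1:8])            # full-data means of the 8 predictors
scl <- apply(prostate[,1:8], 2, sd)        # full-data standard deviations
trainst2 <- train
trainst2[,1:8] <- scale(train[,1:8], center=ctr, scale=scl)
max(abs(trainst2[,1:8] - trainst[,1:8]))   # should be essentially 0
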
#### Principal Components Regression
library(pls)
set.seed(2)
pcr.fit = pcr(lpsa~., data=trainst, scale=F, validation="CV", segments=10)
summary(pcr.fit)
validationplot(pcr.fit, val.type="MSEP")

# find the best number of components, regenerating part of Figure 3.7 on page 62
# (67 = number of training observations)
itemp = which.min(pcr.fit$validation$PRESS)   # 8
itemp.mean = pcr.fit$validation$PRESS[itemp]/67   # 0.5892642
mean((pcr.fit$validation$pred[,,itemp]-trainst[,9])^2)   # 0.5892642
itemp.sd = sd((pcr.fit$validation$pred[,,itemp]-trainst[,9])^2)/sqrt(67)   # 0.1115468
abline(h=itemp.mean+itemp.sd, lty=2)
# one-standard-error rule: the smallest number of components whose CV error
# lies below the minimum CV error plus one standard error
k.pcr = min((1:pcr.fit$validation$ncomp)[pcr.fit$validation$PRESS/67 < itemp.mean+itemp.sd])   # the chosen k = 3
abline(v=k.pcr, lty=2)
pcr.fit$coefficients[,,k.pcr]   # fitted model with chosen number of components
#      lcavol     lweight         age        lbph         svi
#  0.28666128  0.33910369  0.05628529  0.10152838  0.26148505
#         lcp     gleason       pgg45
#  0.21868062 -0.01605594  0.06170971

# estimating mean prediction error (30 = number of testing observations)
test.pcr = predict(pcr.fit, as.matrix(testst[,1:8]), ncomp=k.pcr)
# mean (absolute) prediction error
mean(abs(test[,9]-test.pcr))         # 0.5513026
# mean (squared) prediction error
mean((test[,9]-test.pcr)^2)          # 0.4956846
# standard error of mean (squared) prediction error
sd((test[,9]-test.pcr)^2)/sqrt(30)   # 0.1328556


#### Partial Least Squares using R
#### Section 3.5.2 in the ESL book, or Section 6.3.2 in the ISL book
#### (An Introduction to Statistical Learning: with Applications in R)
library(pls)
set.seed(1)
plsr.fit = plsr(lpsa~., data=trainst, scale=F, validation="CV", segments=10)
summary(plsr.fit)
validationplot(plsr.fit, val.type="MSEP")

# find the best number of components, regenerating part of Figure 3.7 on page 62
itemp = which.min(plsr.fit$validation$PRESS)   # 5
itemp.mean = plsr.fit$validation$PRESS[itemp]/67   # 0.6162555
mean((plsr.fit$validation$pred[,,itemp]-trainst[,9])^2)   # 0.6162555
itemp.sd = sd((plsr.fit$validation$pred[,,itemp]-trainst[,9])^2)/sqrt(67)   # 0.1150495
abline(h=itemp.mean+itemp.sd, lty=2)
k.plsr = min((1:plsr.fit$validation$ncomp)[plsr.fit$validation$PRESS/67 < itemp.mean+itemp.sd])   # the chosen k = 2
abline(v=k.plsr, lty=2)
plsr.fit$coefficients[,,k.plsr]   # fitted model with chosen number of components
#      lcavol     lweight         age        lbph         svi
#  0.41925335  0.34486789 -0.02588107  0.21992197  0.24319848
#         lcp     gleason       pgg45
#  0.07845299  0.01083593  0.08372182

# estimating mean prediction error
test.plsr = predict(plsr.fit, as.matrix(testst[,1:8]), ncomp=k.plsr)
# mean (absolute) prediction error
mean(abs(test[,9]-test.plsr))         # 0.5548436
# mean (squared) prediction error
mean((test[,9]-test.plsr)^2)          # 0.526937
# standard error of mean (squared) prediction error
sd((test[,9]-test.plsr)^2)/sqrt(30)   # 0.1503803
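
## [Added sketch, not part of the original commands] Side-by-side summary of
## the test-set results computed above, collected into one data frame for
## comparison: PCR with k = 3 components versus PLS with k = 2 components.
results <- data.frame(
  method       = c("PCR (k=3)", "PLS (k=2)"),
  mean.abs.err = c(mean(abs(test[,9]-test.pcr)), mean(abs(test[,9]-test.plsr))),
  mean.sq.err  = c(mean((test[,9]-test.pcr)^2),  mean((test[,9]-test.plsr)^2)),
  se.sq.err    = c(sd((test[,9]-test.pcr)^2)/sqrt(30),
                   sd((test[,9]-test.plsr)^2)/sqrt(30)))
results
# expected (from the runs above): PCR 0.5513026 / 0.4956846 / 0.1328556,
#                                 PLS 0.5548436 / 0.5269370 / 0.1503803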