## R commands for STAT 485
## Intermediate Statistical Techniques for Machine Learning and Big Data
## Version: 2024.03.29
## Reference: http://homepages.math.uic.edu/~jyang06/stat485/stat485.html
## R package: ElemStatLearn, "ElemStatLearn_2015.6.26.2.tar"
## Source: https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/

## R code for Chapter 4 in The Elements of Statistical Learning (ESL)

## Load R package for the book
library("ElemStatLearn")

## load the training data
data(vowel.train)
str(vowel.train)
# 'data.frame': 528 obs. of 11 variables:
#  $ y   : int 1 2 3 4 5 6 7 8 9 10 ...
#  $ x.1 : num -3.64 -3.33 -2.12 -2.29 -2.6 ...
#  $ x.2 : num 0.418 0.496 0.894 1.809 1.938 ...
table(vowel.train$y)
#  1  2  3  4  5  6  7  8  9 10 11
# 48 48 48 48 48 48 48 48 48 48 48
# y=vowel.train$y
# x=as.matrix(vowel.train[,-1])

## I) Linear Discriminant Analysis, Section 4.3 in ESL
library(MASS)
lda.fit=lda(y ~ ., data=vowel.train)
lda.fit
plot(lda.fit)

## check training errors, Table 4.1, page 107
lda.pred=predict(lda.fit, vowel.train)
names(lda.pred)
lda.class=lda.pred$class
table(lda.class, vowel.train$y)
mean(lda.class!=vowel.train$y)    # 0.3162879

## check testing errors, Table 4.1, page 107
data(vowel.test)
lda.pred=predict(lda.fit, newdata=vowel.test)
names(lda.pred)
lda.class=lda.pred$class
table(lda.class, vowel.test$y)
mean(lda.class!=vowel.test$y)     # 0.5562771

## II) Quadratic Discriminant Analysis, Section 4.3, page 110
qda.fit=qda(y ~ ., data=vowel.train)
qda.fit

## check training errors, Table 4.1, page 107
qda.pred=predict(qda.fit, vowel.train)
names(qda.pred)
qda.class=qda.pred$class
table(qda.class, vowel.train$y)
mean(qda.class!=vowel.train$y)    # 0.01136364

## check testing errors, Table 4.1, page 107
qda.pred=predict(qda.fit, newdata=vowel.test)
names(qda.pred)
qda.class=qda.pred$class
table(qda.class, vowel.test$y)
mean(qda.class!=vowel.test$y)     # 0.5281385

## III) find optimal subspaces for LDA, Section 4.3.3, pages 113-115
K <- 11                       # 11 vowels or 11 classes
p <- 10                       # dimension of input space
n <- dim(vowel.train)[1]      # number of observations
y <- vowel.train$y            # output or response, integer values 1, 2, ..., 11
X <- vowel.train[,2:(p+1)]    # input values

# 11 class centroids in R^10
M <- matrix(0, K, p)
for(i in 1:K) M[i,] <- apply(X[y==i,], 2, mean)

# calculate within-class covariance
W <- t(as.matrix(X) - M[y, ]) %*% (as.matrix(X) - M[y, ])/n

# calculate Mstar = M W^{-1/2}
temp <- svd(W)                                            # singular value decomposition
Wn0p5 <- temp$u %*% diag(1/sqrt(temp$d)) %*% t(temp$v)    # W^{-1/2}
Mstar <- M %*% Wn0p5

# calculate covariance of Mstar
Bstar <- var(Mstar)
# alternative way
temp <- Mstar
for(i in 1:10) temp[,i] <- temp[,i]-mean(temp[,i])
Bstar <- t(temp) %*% temp/10

# eigen-decomposition of Bstar
temp <- eigen(Bstar)
Vstar <- temp$vectors         # columns are v^*_l
V <- Wn0p5 %*% Vstar          # columns are v_l

# discriminant variables
Z <- as.matrix(X) %*% V
for(i in 1:10) Z[,i] <- Z[,i] - mean(Z[,i])
Z <- -Z
Z[,2] <- -Z[,2]
Z[,10] <- -Z[,10]
Mnew <- matrix(0, K, p)
for(i in 1:K) Mnew[i,] <- apply(Z[y==i,], 2, mean)

## plot Figure 4.4 (page 107) and Figure 4.8 (page 115)
# Figure 4.4
par(mfrow=c(1,1))
i1 <- 1
i2 <- 2
plot(Z[,i1], Z[,i2], xlab="Coordinate 1 for Training Data",
     ylab="Coordinate 2 for Training Data",
     main="Linear Discriminant Analysis", type="n")
for(i in 1:11) {
  points(Mnew[i,i1], Mnew[i,i2], col=i, type="p", pch="o", cex=2)
  points(Z[y==i,i1], Z[y==i,i2], col=i, type="p", pch=21, cex=1)
}
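## Optional cross-check (a sketch added here, not part of the original handout):
## the hand-computed discriminant variables Z should span the same directions as
## the canonical variates returned by MASS::lda(), up to sign and scale, so the
## column-wise absolute correlations below are expected to be close to 1.
## The object name lda.train.x is introduced only for this check.
lda.train.x <- predict(lda.fit, vowel.train)$x    # canonical variates from lda()
round(abs(diag(cor(lda.train.x, Z))), 3)          # values near 1 indicate agreement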
# Figure 4.8
par(mfrow=c(2,2))
i1 <- 1
i2 <- 3
plot(Z[,i1], Z[,i2], xlab="Coordinate 1", ylab="Coordinate 3", type="n")
for(i in 1:11) {
  points(Mnew[i,i1], Mnew[i,i2], col=i, type="p", pch="o", cex=1.5)
  points(Z[y==i,i1], Z[y==i,i2], col=i, type="p", pch=21, cex=0.7)
}
i1 <- 2
i2 <- 3
plot(Z[,i1], Z[,i2], xlab="Coordinate 2", ylab="Coordinate 3", type="n")
for(i in 1:11) {
  points(Mnew[i,i1], Mnew[i,i2], col=i, type="p", pch="o", cex=1.5)
  points(Z[y==i,i1], Z[y==i,i2], col=i, type="p", pch=21, cex=0.7)
}
i1 <- 1
i2 <- 7
plot(Z[,i1], Z[,i2], xlab="Coordinate 1", ylab="Coordinate 7", type="n")
for(i in 1:11) {
  points(Mnew[i,i1], Mnew[i,i2], col=i, type="p", pch="o", cex=1.5)
  points(Z[y==i,i1], Z[y==i,i2], col=i, type="p", pch=21, cex=0.7)
}
i1 <- 9
i2 <- 10
plot(Z[,i1], Z[,i2], xlab="Coordinate 9", ylab="Coordinate 10", type="n")
for(i in 1:11) {
  points(Mnew[i,i1], Mnew[i,i2], col=i, type="p", pch="o", cex=1.5)
  points(Z[y==i,i1], Z[y==i,i2], col=i, type="p", pch=21, cex=0.7)
}

## IV) Logistic regression, (4.17) on page 119, multinomial response
data(vowel.train)
# need to install the package "VGAM" first
library(VGAM)
fitvoweltrain <- vglm(y~x.1+x.2+x.3+x.4+x.5+x.6+x.7+x.8+x.9+x.10,
                      family=multinomial(), data=vowel.train)
#summary(fitvoweltrain)
fitvoweltrain@coefficients
temp=fitvoweltrain@coefficients
dim(temp)=c(10,11)
rownames(temp)=(1:10)
colnames(temp)=c("Intercept", paste("x.",1:10,sep=""))
temp
fitvoweltrain@fitted.values

## check training errors, Table 4.1, page 107
fittedlabel <- apply(fitvoweltrain@fitted.values, 1, which.max)
sum(fittedlabel!=vowel.train$y)                        # 118
sum(fittedlabel!=vowel.train$y)/dim(vowel.train)[1]    # 0.2234848

par(mfrow=c(1,1))
i1 <- 1
i2 <- 2
plot(Z[,i1], Z[,i2], xlab="Coordinate 1 for Training Data",
     ylab="Coordinate 2 for Training Data",
     main="Linear Discriminant Analysis", type="n")
for(i in 1:11) points(Mnew[i,i1], Mnew[i,i2], col=i, type="p", pch="o", cex=2)
text(Z[fittedlabel<10,i1], Z[fittedlabel<10,i2],
     labels=fittedlabel[fittedlabel<10], col=fittedlabel[fittedlabel<10], cex=0.7)
text(Z[fittedlabel==10,i1], Z[fittedlabel==10,i2], labels="0", col=10, cex=0.7)
text(Z[fittedlabel==11,i1], Z[fittedlabel==11,i2], labels="a", col=11, cex=0.7)

## predict class labels of testing data
data(vowel.test)
K <- 11                      # 11 vowels or 11 classes
p <- 10                      # dimension of input space
nt <- dim(vowel.test)[1]     # number of observations
yt <- vowel.test$y           # output or response, integer values 1, 2, ..., 11
Xt <- vowel.test[,2:(p+1)]   # input values
Xt1 <- cbind(rep(1, nt), Xt)           # matrix [1 X], (462*11)
betafitted <- fitvoweltrain@coefficients
dim(betafitted) <- c(K-1,K)            # fitted beta values based on training data
fittedtestvalues <- matrix(1, nt, K)   # fitted Pr(u belongs to k^th class | x_u)
fittedtestvalues[,1:10] <- exp(as.matrix(Xt1) %*% t(as.matrix(betafitted)))
fittedtestvalues <- fittedtestvalues/apply(fittedtestvalues, 1, sum)

# testing error rate, Table 4.1, page 107
fittedlabelt <- apply(fittedtestvalues, 1, which.max)
sum(fittedlabelt!=vowel.test$y)      # 237
sum(fittedlabelt!=vowel.test$y)/nt   # 0.512987

# display results
logisticresult <- cbind(vowel.test$y, fittedlabelt,
                        round(fittedtestvalues[cbind(seq(1:nt),fittedlabelt)],3),
                        round(fittedtestvalues[cbind(seq(1:nt),vowel.test$y)],3))
colnames(logisticresult) <- c("True", "Fit", "FitProb", "TrueProb")
logisticresult
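## Alternative fit (a hedged sketch, not in the original handout): the same
## multinomial logit model can be fit with nnet::multinom(), which supplies a
## predict() method for new data and avoids the manual softmax reconstruction
## above.  The error rate may differ slightly from the vglm() fit because a
## different optimizer and stopping rule are used; maxit is raised to help
## convergence.  vowel.train.f and multinom.fit are names introduced here.
library(nnet)
vowel.train.f <- vowel.train
vowel.train.f$y <- factor(vowel.train.f$y)   # multinom() expects a factor response
multinom.fit <- multinom(y ~ ., data=vowel.train.f, maxit=1000)
multinom.test.class <- predict(multinom.fit, newdata=vowel.test, type="class")
mean(as.character(multinom.test.class) != as.character(vowel.test$y))
# expected to be roughly comparable to the 0.512987 obtained above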
## plot training data and testing data
#postscript("vowel_train.ps",paper="special",width=8,height=6.5,horizontal=F)
par(mfrow=c(1,1))
i1 <- 1
i2 <- 2
plot(Z[,i1], Z[,i2], xlab="Coordinate 1 for Training Data",
     ylab="Coordinate 2 for Training Data",
     main="Linear Discriminant Analysis", type="n")
for(i in 1:9) {
  text(Mnew[i,i1], Mnew[i,i2], labels=i, cex=2)
  text(Z[y==i,i1], Z[y==i,i2], labels=i, cex=0.8)
}
text(Mnew[10,i1], Mnew[10,i2], labels="0", cex=2)
text(Z[y==10,i1], Z[y==10,i2], labels="0", cex=0.8)
text(Mnew[11,i1], Mnew[11,i2], labels="a", cex=2)
text(Z[y==11,i1], Z[y==11,i2], labels="a", cex=0.8)
#dev.off()

Zold <- as.matrix(X) %*% V
Zt <- as.matrix(Xt) %*% V
for(i in 1:10) Zt[,i] <- Zt[,i] - mean(Zold[,i])
Zt <- -Zt
Zt[,2] <- -Zt[,2]
Zt[,10] <- -Zt[,10]

#postscript("vowel_test.ps",paper="special",width=8,height=6.5,horizontal=F)
par(mfrow=c(1,1))
i1 <- 1
i2 <- 2
plot(append(Z[,i1],Zt[,i1]), append(Z[,i2],Zt[,i2]),
     xlab="Coordinate 1 for Training Data",
     ylab="Coordinate 2 for Training Data",
     main="Training Centroids Plus Testing Data", type="n")
for(i in 1:9) {
  text(Mnew[i,i1], Mnew[i,i2], labels=i, cex=2)
  text(Zt[yt==i,i1], Zt[yt==i,i2], labels=i, cex=0.8)
}
text(Mnew[10,i1], Mnew[10,i2], labels="0", cex=2)
text(Zt[yt==10,i1], Zt[yt==10,i2], labels="0", cex=0.8)
text(Mnew[11,i1], Mnew[11,i2], labels="a", cex=2)
text(Zt[yt==11,i1], Zt[yt==11,i2], labels="a", cex=0.8)
#dev.off()
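## Optional follow-up (a sketch, not in the original handout): predict.lda()
## accepts a 'dimen' argument, so the reduced-rank LDA error rates discussed in
## Section 4.3.3 of ESL (cf. Figure 4.10) can be traced over the subspace
## dimension on the testing data.  lda.dim.err is a name introduced here; the
## exact values depend on the lda.fit object computed in part I).
lda.dim.err <- sapply(1:10, function(d)
  mean(predict(lda.fit, newdata=vowel.test, dimen=d)$class != vowel.test$y))
round(lda.dim.err, 3)
plot(1:10, lda.dim.err, type="b", xlab="Dimension", ylab="Test Error Rate",
     main="Reduced-Rank LDA on Vowel Testing Data")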