## R code for k-nearest neighbor classifier
## Source: http://www-bcf.usc.edu/~gareth/ISL/Chapter%204%20Lab.txt
## updated on 4/24/2024

# The Stock Market Data (S&P Stock Market Data) ----
library(ISLR)
library(class)

names(Smarket)
dim(Smarket) #[1] 1250 9
summary(Smarket)
pairs(Smarket)
# Column 9 (Direction) is a factor, so it must be dropped before cor();
# cor(Smarket) on the full frame errors with "'x' must be numeric".
cor(Smarket[, -9])
plot(Smarket$Volume)

train <- 1:100 # added 4/24/2024: first 100 days form the training set

# Predictors: Lag1 and Lag2 only (explicit column references instead of attach())
Smarket.X <- as.matrix(Smarket[, c("Lag1", "Lag2")])
train.X <- Smarket.X[train, ]
test.X <- Smarket.X[-train, ]
train.Direction <- Smarket$Direction[train]
test.Direction <- Smarket$Direction[-train]

set.seed(1) # knn() breaks ties at random, so fix the seed for reproducibility
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, test.Direction)
(223 + 346) / 1150 # 0.4947826 -- overall accuracy for k = 1 (1150 test obs)

knn.pred <- knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred, test.Direction)
mean(knn.pred == test.Direction) #0.486087

## use more covariates
## NOTE(review): including Today leaks the response -- Direction is the sign
## of Today -- which is why accuracy jumps to ~0.8 below. Kept to reproduce
## the original lab output; do not use Today as a real predictor.
more.vars <- c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Today")
Smarket.X <- as.matrix(Smarket[, more.vars])
train.X <- Smarket.X[train, ]
test.X <- Smarket.X[-train, ]
train.Direction <- Smarket$Direction[train]

set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, test.Direction)
mean(knn.pred == test.Direction) # 0.793913

knn.pred <- knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred, test.Direction)
mean(knn.pred == test.Direction) # 0.8104348

### R function to find the optimal number of nearest neighbors in the
### k-nearest neighbor method, based on leave-one-out cross validation
### using the number of misclassified labels.
### Functions needed: "knn.cv" in package "class"
### Input:  X (nobs*nx), class (nobs*1, in the form of 1:n),
###         knnvec (vector of candidate numbers of nearest neighbors)
### Output: k.optimal (optimal number of nearest neighbors),
###         error.vec (vector of numbers of errors)
### Prints a warning message if the optimal k is at the edge of the candidates.
Knncv.leaveoneout.self <- function(X, class, knnvec) {
  error.vec <- rep(0, length(knnvec)) # LOO error count for each candidate k
  names(error.vec) <- knnvec
  for (ik in seq_along(knnvec)) {
    k <- knnvec[ik]
    temp <- knn.cv(train = X, cl = class, k = k) # leave-one-out predictions
    error.vec[ik] <- sum(temp != class)          # number of misclassified obs
  }
  # which.min() returns the FIRST minimizer, i.e. the smallest k among ties
  # (same tie-breaking as order(error.vec)[1] in the original code).
  kindex <- which.min(error.vec)
  k.optimal <- knnvec[kindex]
  if (kindex == 1) {
    cat("\nOptimal number of nearest neighbors (", k.optimal,
        ") attains the minimum of the candidates!\n")
  }
  if (kindex == length(knnvec)) {
    cat("\nOptimal number of nearest neighbors (", k.optimal,
        ") attains the maximum of the candidates!\n")
  }
  list(k.optimal = k.optimal, error.vec = error.vec)
}

knnvec <- seq(1, 21, by = 2) # candidate numbers of neighbors, increasing order
temp <- Knncv.leaveoneout.self(X = train.X, class = train.Direction,
                               knnvec = knnvec)
k.optimal <- temp$k.optimal # 11

# An Application to Caravan Insurance Data (TIC Benchmark) ----
dim(Caravan) #[1] 5822 86
summary(Caravan$Purchase)
348 / 5822 # only ~6% of customers purchase insurance

# Standardize predictors so KNN distances are not dominated by variables
# that happen to be on a large scale (column 86, Purchase, is the response).
standardized.X <- scale(Caravan[, -86])
var(Caravan[, 1])
var(Caravan[, 2])
var(standardized.X[, 1]) # 1 after scaling
var(standardized.X[, 2]) # 1 after scaling

test <- 1:1000 # first 1000 observations held out as the test set
train.X <- standardized.X[-test, ]
test.X <- standardized.X[test, ]
train.Y <- Caravan$Purchase[-test]
test.Y <- Caravan$Purchase[test]

set.seed(1)
knn.pred <- knn(train.X, test.X, train.Y, k = 1)
mean(test.Y != knn.pred) # 0.118 overall test error
mean(test.Y != "No")     # 0.059 error of the trivial always-"No" classifier
table(knn.pred, test.Y)
9 / (68 + 9) # precision among predicted "Yes" for k = 1

knn.pred <- knn(train.X, test.X, train.Y, k = 3)
table(knn.pred, test.Y)
5 / 26 # precision among predicted "Yes" for k = 3

knn.pred <- knn(train.X, test.X, train.Y, k = 5)
table(knn.pred, test.Y)
4 / 15 # precision among predicted "Yes" for k = 5