###### R code for the paper ###### "Sliced Inverse Moment Regression Using Weighted Chi-Squared Tests for Dimension Reduction", ###### Journal of Statistical Planning and Inference, Vol. 140, No. 11, 3121-3131. ###### using R package "dr" ###### construct weighted ASAVE and corresponding Chisquare test ### need to install the "dr" package from http://cran.r-project.org/ ### load the package library("dr") ### generate simulated data ### y=2*x1*epsilon+x2^2+x3 ### (x1,x2,x3,x4,epsilon) iid ~ N(0,I) ### sample size: 400 nNum<-400 Xm<-rep(0,nNum*5) ## 400*(X1,X2,X3,X4,epsilon) dim(Xm)<-c(nNum,5) #for(i in 1:5) Xm[,i]<-rnorm(nNum); ## generate Xm randomly # read Xm from "onesample.dat" temp <- read.table("onesample.dat") for(i in 1:nNum) for(j in 1:5) Xm[i,j] <- temp[i,j]; Y<-rep(0,nNum) Y<-2*Xm[,1]*Xm[,5]+Xm[,2]^2+Xm[,3]; Xm0<-Xm[,1:4] ## backup data Y0<-Y X1<-Xm[,1] X2<-Xm[,2] X3<-Xm[,3] X4<-Xm[,4] #### self-defined function for non-negative matrix decomposition #### A=P^2, return P #### idea: A=UDV^T, P=U*sqrt(D)*V^T (U=V if A=A^T) AAself<-function(A) { ## A: p*p non-negative matrix p<-dim(A)[1]; Asvd<-svd(A); D<-matrix(0,p,p); for(i in 1:p) { D[i,i]<-Asvd$d[i]; if(D[i,i]<0) {cat("\n Not non-negative matrix!\n");return(-1);} D[i,i]<-sqrt(D[i,i]); } P<-Asvd$u%*%D%*%t(Asvd$v); return(P); } #### self-defined function for non-negative matrix decomposition #### A=P^2, return P^{-1}=A^{-1/2} #### idea: A=UDV^T, P^{-1}=U*sqrt(1/D)*V^T (U=V if A=A^T) A1self<-function(A) { ## A: p*p non-negative matrix p<-dim(A)[1]; Asvd<-svd(A); D<-matrix(0,p,p); for(i in 1:p) { D[i,i]<-Asvd$d[i]; if(D[i,i]<0) {cat("\n Not non-negative matrix!\n");return(-1);} D[i,i]<-sqrt(1/D[i,i]); } P<-Asvd$u%*%D%*%t(Asvd$v); return(P); } #### self-defined permutation test ## dealing with self-defined SAVEfit, ASAVEfit objects permutestself<-function(object, npermute=50, numdir=object$numdir) { call<-object$call; x<-object$x; n<-dim(x)[1]; xm<-apply(x,2,mean); for(i in 1:n) x[i,]<-x[i,]-xm; x<-x%*%object$evectors; nd <- min(numdir, length(which(abs(object$evalues)>1e-08))-1); nt <- nd + 1; obstest<-rep(0,nt); for(i in 1:nt) obstest[i]<-n*sum(object$evalues[i:nt]); count <- rep(0, nt); val <- rep(0, nt); for (j in 1:npermute) { perm <- sample(1:object$cases); for (col in 0:nd) { call$x<-if (col == 0) x[perm, ] else cbind(x[, (1:col)], x[perm, -(1:col)]); iperm <- eval(call); val[col + 1] <- n*sum(iperm$evalues[(col+1):nt]); } count[val > obstest] <- count[val > obstest] + 1 } pval <- (count)/(npermute + 1) ans1 <- data.frame(cbind(obstest, pval)) dimnames(ans1) <- list(paste(0:(nt - 1), "D vs >= ", 1:nt,"D", sep = ""), c("Stat", "p-value")) ans <- list(summary = ans1, npermute = npermute) class(ans) <- "dr.permutation.test.self" ans } ################ SIMR, Sliced Inverse Moment Regression ##### M^alpha_ASAVE = alpha * M_SIR + (1-alpha) * M_zz'|y ##### 0 <= alpha <= 1 ## need functions: dr.slices in "dr" package ## output: eigenvalues, eigenvectors,etc Wasaveself<-function(y,x,slice=NULL,alpha) { # n: sample size; p: number of explanatory variables # y: response, n*1; # x: explanatory variables, n*p # slice: number of slices # alpha: 0 <= alpha <= 1 n<-length(y); p<-dim(x)[2]; z<-x; for(i in 1:p) z[,i]<-x[,i]-mean(x[,i]); sigxx<-t(z)%*%z/n; ## Sigma_xx sigxx2<-A1self(sigxx); ## Sigma^{-1/2} z<-z%*%sigxx2; ## standardized z slice<-if(!is.null(slice)) slice else max(8,NCOL(z)+3); slices<-dr.slices(y,slice); slice<-slices$nslices; ## modify real #slice due to discontinuity of y ASAVE<-matrix(0,p,p); ## estimated candidate matrix id<-diag(rep(1,p)); ## identity matrix for(i in 1:slice) { zh<-z[slices$slice.indicator==i,]; nh<-slices$slice.sizes[i]; muh<-apply(zh,2,mean); ## estimated E(z|y in I_h) zzave<-t(zh)%*%zh/nh; ## estimated E(zz'|y in I_h) ASAVE<-ASAVE+((1-alpha)*(id-zzave)%*%(id-zzave)+alpha*muh%*%t(muh))*nh/n; } asaveeigen<-eigen(ASAVE); eigenvalues<-asaveeigen$values; eigenvectors<-sigxx2%*%asaveeigen$vectors; for(i in 1:p) eigenvectors[,i]<-eigenvectors[,i]/sqrt(sum(eigenvectors[,i]^2)); ans<-list(x=x,y=y,call=match.call(),cases=n,evectors=eigenvectors,evalues=eigenvalues,slice.info=slices,numdir=p); ans; } ## Example wasavefit05<-Wasaveself(Y,Xm[,1:4],slice=8,alpha=0.5) wasavefit05$evectors;wasavefit05$evalues;wasavefit05$slice.info #### Extended Chi-squared test for SIMR Wchisqself<-function(y,x,slice=NULL,alpha) { # y: response, n*1; # x: explanatory variables, n*p # slice: number of slices # 0 <= alpha <= 1 n<-length(y); p<-dim(x)[2]; if(dim(x)[1]!=n) { cat("\n Dimensions don't match!"); return(-1); } if((alpha<0)||(alpha>1)) { cat("\n Alpha should be between 0 and 1!"); return(-1); } z<-x; ## z mux<-apply(x,2,mean); ## mu_x for(i in 1:p) z[,i]<-x[,i]-mux[i]; sigxx<-t(z)%*%z/n; ## sigma_xx sigxx2<-A1self(sigxx); ## sigma^{-1/2} z<-z%*%sigxx2; ## standardized z slice<-if(!is.null(slice)) slice else max(8,NCOL(x)+3); slices<-dr.slices(y,slice); slice<-slices$nslices; ## modify real #slice due to discontinuity of y nh<-slices$slice.sizes; ## number of observations in slices fh<-nh/n; ## f_h muxh<-matrix(0,slice,p); ## mu_h(x) sigxh<-rep(0,p*p*slice) ## sig_h(x,x) dim(sigxh)<-c(p,p,slice); muxxh<-sigxh; ## mu_h(xx') sigxx21<-rep(0,p*p*p*slice); ## sig_h(xx',x) dim(sigxx21)<-c(p*p,p,slice); sigxx22<-rep(0,p*p*p*p*slice); ## sig_h(xx',xx') dim(sigxx22)<-c(p*p,p*p,slice); for(i in 1:slice) { xtemp<-x[slices$slice.indicator==i,]; muxh[i,]<-apply(xtemp,2,mean); muxxh[,,i]<-t(xtemp)%*%xtemp/nh[i]; xxtemp<-matrix(0,nh[i],p*p); ## xx'-mu_h(xx') for(j in 1:nh[i]) xxtemp[j,]<-as.vector(xtemp[j,]%*%t(xtemp[j,])-muxxh[,,i]); for(j in 1:nh[i]) xtemp[j,]<-xtemp[j,]-muxh[i,]; sigxh[,,i]<-t(xtemp)%*%xtemp/nh[i]; sigxx21[,,i]<-t(xxtemp)%*%xtemp/nh[i]; sigxx22[,,i]<-t(xxtemp)%*%xxtemp/nh[i]; } delta0<-matrix(0,p*(p*slice+slice+1),p*(p*slice+slice+1)); ## Delta_0 for(h in 1:slice) { itemp<-p*p*(h-1); delta0[(itemp+1):(itemp+p*p),(itemp+1):(itemp+p*p)]<-sigxx22[,,h]/fh[h]; itemp<-p*p*slice+p*(h-1); delta0[(itemp+1):(itemp+p),(itemp+1):(itemp+p)]<-sigxh[,,h]/fh[h]; jtemp<-p*p*(h-1); delta0[(itemp+1):(itemp+p),(jtemp+1):(jtemp+p*p)]<-t(sigxx21[,,h])/fh[h]; delta0[(jtemp+1):(jtemp+p*p),(itemp+1):(itemp+p)]<-sigxx21[,,h]/fh[h]; itemp<-p*(p*slice+slice); delta0[(itemp+1):(itemp+p),(jtemp+1):(jtemp+p*p)]<-t(sigxx21[,,h]); delta0[(jtemp+1):(jtemp+p*p),(itemp+1):(itemp+p)]<-sigxx21[,,h]; jtemp<-p*p*slice+p*(h-1); delta0[(itemp+1):(itemp+p),(jtemp+1):(jtemp+p)]<-sigxh[,,h]; delta0[(jtemp+1):(jtemp+p),(itemp+1):(itemp+p)]<-sigxh[,,h]; } itemp<-p*(p*slice+slice); delta0[(itemp+1):(itemp+p),(itemp+1):(itemp+p)]<-sigxx; Mn<-t(muxh); ## M_n gvec<-matrix(0,slice*p*(p+1),p*(slice*(p+1)+1)); ## \dot g[vec(O,M,mu_x)] gvec[1:(p*p*slice),1:(p*p*slice)]<-diag(rep(1,p*p*slice)); itemp<-p*p*slice; gvec[(itemp+1):(itemp+p*slice),(itemp+1):(itemp+p*slice)]<-diag(rep(1,p*slice)); mtemp<--diag(rep(1,slice))%x%(as.matrix(mux))%x%diag(rep(1,p))-diag(rep(1,p*slice))%x%(as.matrix(mux)); gvec[1:(p*p*slice),(itemp+1):(itemp+p*slice)]<-mtemp; mtemp<--(as.matrix(as.vector(Mn)))%x%diag(rep(1,p)); for(h in 1:slice) { mtemp[(p*p*(h-1)+1):(p*p*h),]<-mtemp[(p*p*(h-1)+1):(p*p*h),]-diag(rep(1,p))%x%(as.matrix(muxh[h,])); } jtemp<-p*slice*(p+1); gvec[1:(p*p*slice),(jtemp+1):(jtemp+p)]<-mtemp; delta<-gvec%*%delta0%*%t(gvec); ## Delta temp<-matrix(0,p*(p+1)*slice,p*(p+1)*slice); diag(temp)[1:(p^2*slice)]<-sqrt(1-alpha); diag(temp)[(p^2*slice+1):(p*(p+1)*slice)]<-sqrt(alpha); delta<-temp %*% delta %*% temp; ## Delta^alpha fm<-diag(rep(1,slice))-fh%*%t(rep(1,slice)); ## F gm<-diag(sqrt(fh)); ## G fgm<-fm%*%gm; ## FG muzh<-matrix(0,slice,p); ## mu_h(z) muzzh<-rep(0,p*p*slice); ## mu_h(zz') dim(muzzh)<-c(p,p,slice); um<-matrix(0,p,slice*(p+1)); ## U_n for(h in 1:slice) { ztemp<-z[slices$slice.indicator==h,]; muzh[h,]<-apply(ztemp,2,mean); muzzh[,,h]<-t(ztemp)%*%ztemp/nh[h]; itemp<-p*(h-1); um[,(itemp+1):(itemp+p)]<-(muzzh[,,h]-diag(rep(1,p)))*sqrt(fh[h]); um[,p*slice+h]<-muzh[h,]*sqrt(fh[h]); } temp<-matrix(0,(p+1)*slice,(p+1)*slice); diag(temp)[1:(p*slice)]<-sqrt(1-alpha); diag(temp)[(p*slice+1):((p+1)*slice)]<-sqrt(alpha); um<-um %*% temp; ## U^alpha_n umsvd<-svd(um,nu=p,nv=slice*(p+1)); ## UDV' gamma1<-umsvd$u; ## (Gamma_11,Gamma_12) gamma2<-umsvd$v; ## (Gamma_21,Gamma_22) dv<-umsvd$d; ## D value<-matrix(0,p,4); ## output: (d,Lambda_d, d.f., p-value) colnames(value)<-c("d","Stat","d.f.","p-value"); for(d in 0:(p-1)) { gamma12<-gamma1[,(d+1):p]; ## Gamma_12 gamma22<-gamma2[,(d+1):(slice*(p+1))]; ## Gamma_22 tempm<-matrix(0,slice*(p+1),slice*(p+1)); itemp<-slice*p; tempm[1:itemp,1:itemp]<-fgm%x%sigxx2; ## FG(*)Sig_x^(-1/2) tempm[(itemp+1):(itemp+slice),(itemp+1):(itemp+slice)]<-fgm; ## FG tempm<-tempm%*%gamma22; tempm<-tempm%x%t(t(gamma12)%*%sigxx2); wm<-t(tempm)%*%delta%*%tempm; ## W^alpha_n tn<-round(sum(diag(wm))^2/sum(diag(wm%*%wm))); ## tn lambdad<-n*sum(dv[(d+1):p]^2); ## Lambda_d lambdad<-lambdad*tn/sum(diag(wm)); value[d+1,1]<-d; value[d+1,2]<-lambdad; value[d+1,3]<-tn; value[d+1,4]<-1-pchisq(lambdad,tn); ## p-value } value; } #Wchisqself(Y,Xm[,1:4],slice=8,alpha=0.5) ## function to compute the vector correlation coefficient q (Hotlling 1936) ## and the trace correlation r (Hooper 1959), see Ye&Weiss(2003) ## input: A (nt*nm), B (nt*nl), Note: rank(A)=nm, rank(B)=nl ## ID=(A'A)^{-1}A'B(B'B)^{-1}B'A, (2.12, page 247, Hooper 1959) ## q^2=|ID|, r^2=trace(ID)/nm ## output: (1-r), (1-q), arccos(q) vcorrelation <- function(A, B) { nt <- dim(A)[1]; nm <- dim(A)[2]; if(dim(B)[1]!=nt) stop("dimensions do not match"); nl <- dim(B)[2]; ID <- solve(t(A)%*%A) %*% t(A) %*% B %*% solve(t(B)%*%B) %*% t(B) %*% A; Q <- sqrt(det(ID)); # q arcq <- acos(min(Q,1-1e-17))*180/pi; # arccos(q) in degrees, not radians Q <- max(1 - Q, 0); # 1-q R <- 1 - sqrt(mean(diag(ID))); # 1-r list(R=R, Q=Q, arcq=arcq); } # Example A=savefit$evectors[,1:4] B=diag(c(1,1,1,1))[,1:4] vcorrelation(A,B) vcorrelation(B,A) vcorrelation(A,A) vcorrelation(B,B) ### Analyze the simulated data "onesample.dat" ### y=2*x1*epsilon+x2^2+x3 ### (x1,x2,x3,x4,epsilon) iid ~ N(0,I) ### sample size: 400 ########## number of slices used for "onesample.dat nslice=10 ########## # SIR cat("\n### SIR:\n") sirfit<-dr(Y~X1+X2+X3+X4, nslices=nslice, method="sir") print(round(sirfit$evectors,3)) print(round(summary(sirfit)$test,3)) print(summary(sirfit)$test[,3]) # r-pHd cat("### r-pHd:\n") rphdfit<-dr(Y~X1+X2+X3+X4, nslices=nslice, method="phdres") print(round(rphdfit$evectors,3)) print(round(summary(rphdfit)$test,3)) print(summary(rphdfit)$test[,3]) # y-pHd cat("\n### y-pHd:\n") yphdfit<-dr(Y~X1+X2+X3+X4, nslices=nslice, method="phdy") print(round(yphdfit$evectors,3)) print(round(summary(yphdfit)$test,3)) print(summary(yphdfit)$test[,3]) # ire cat("\n### ire:\n") irefit<-dr(Y~X1+X2+X3+X4, nslices=nslice, method="ire") print(round(summary(irefit)$result[[3]]$B,3)) print(summary(irefit)$test) print(summary(irefit)$test[,3]) # SAVE cat("\n### SAVE:\n") savefit<-dr(Y~X1+X2+X3+X4, nslices=nslice, method="save") print(round(savefit$evectors,3)) print(round(summary(savefit)$test,3)) print(summary(savefit)$test[,3]) # SIMR alphavec<-c(0,0.01,0.05,seq(0.1,0.9,by=0.1),0.95,0.99,1) anum<-length(alphavec) for(ia in 1:anum) { wasavefit<-Wasaveself(Y,Xm[,1:4],slice=nslice,alpha=alphavec[ia]); cat("\n### SIMR, alpha=",alphavec[ia],":\n") print(round(wasavefit$evectors,3)) print(round(Wchisqself(Y,Xm[,1:4],slice=nslice,alpha=alphavec[ia]),3)); } for(ia in 1:anum) { wasavefit<-Wasaveself(Y,Xm[,1:4],slice=nslice,alpha=alphavec[ia]); cat("\n### SIMR, alpha=",alphavec[ia],":\n") print(round(wasavefit$evectors,3)) print(round(Wchisqself(Y,Xm[,1:4],slice=nslice,alpha=alphavec[ia]),4)); } ## choose optimal alpha by comparing with the truth alphavec<-c(0,0.01,0.05,seq(0.1,0.9,by=0.1),0.95,0.99,1) anum<-length(alphavec) qvec=rvec=aqvec=rep(0,anum) d05vec=d01vec=rep(0,anum) # dimension based on level 0.05, 0.01 B=diag(c(1,1,1,1))[,1:3] for(ia in 1:anum) { wasavefit<-Wasaveself(Y,Xm[,1:4],slice=nslice,alpha=alphavec[ia]); A=wasavefit$evectors[,1:3] temp=vcorrelation(A,B) qvec[ia]=temp$Q rvec[ia]=temp$R aqvec[ia]=temp$arcq temp=Wchisqself(Y,Xm[,1:4],slice=nslice,alpha=alphavec[ia]) for(i in 1:4) { if(temp[i,4]<0.05) d05vec[ia]<-temp[i,1]+1; if(temp[i,4]<0.01) d01vec[ia]<-temp[i,1]+1; } } cat("\nOptimal alpha according to Q: ",alphavec[order(qvec)[1]],", q=",min(qvec),"\n") cat("\nOptimal alpha according to R: ",alphavec[order(rvec)[1]],", r=",min(rvec),"\n") cat("\nOptimal alpha according to arccos(Q): ",alphavec[order(aqvec)[1]],", arcq=",min(aqvec),"\n") cat("\n") ## Results: #Optimal alpha according to Q: 0.6 , q= 0.01577232 #Optimal alpha according to R: 0.6 , r= 0.005229655 #Optimal alpha according to arccos(Q): 0.6 , arcq= 10.18962 ## Plot output temp=log(cbind(qvec,rvec,aqvec*pi/180)) matplot(c(0,1),c(min(temp),max(temp)),type="n",xlab=expression(alpha),ylab="log scale") points(alphavec, log(qvec), type="l", lty=1, lwd=2) points(alphavec, log(rvec), type="l", lty=2, lwd=2) points(alphavec,log(aqvec*pi/180), type="l",lty=3, lwd=2) abline(v=alphavec[order(qvec)[1]],lty=4) points(alphavec, -d05vec, type="l", lty=2, lwd=1) points(alphavec, -d01vec, type="l", lty=3, lwd=1) legend(0.5,max(temp),legend=c("1-q","1-r","arccos(q)","0.05","0.01",expression(paste(alpha,"=0.49"))),lty=c(1,2,3,2,3,4),lwd=c(2,2,2,1,1,1),cex=0.9) ## check dimension according to SAVE for(k in 1:4) { A=as.matrix(savefit$evectors[,1:k]) B=as.matrix(diag(c(1,1,1,1))[,1:k]) temp=vcorrelation(A,B) if(k==1) for(i in 2:3) { B=as.matrix(diag(c(1,1,1,1))[,i]) temp1=vcorrelation(A,B) if(temp1$Q < temp$Q) temp=temp1; } if(k==2) for(i in 1:2) for(j in (i+1):3) { B=as.matrix(diag(c(1,1,1,1))[,c(i,j)]) temp1=vcorrelation(A,B) if(temp1$Q < temp$Q) temp=temp1; } print(c(temp$Q,temp$R,temp$arcq)) } ## Results: #k=1 0.001033281 0.001033281 2.604859013 #k=2 0.01564951 0.00779395 10.14976395 #k=3 0.19136239 0.05945144 36.03696455 #k=4 0 0 0 B=as.matrix(diag(c(1,1,1,1))[,1:3]) # SIR cat("\n### SIR:\n") A=sirfit$evectors[,1:3] temp=vcorrelation(A,B) print(c(temp$Q,temp$R,temp$arcq)) #[1] 0.5579777 0.1445501 63.7670186 # r-pHd cat("### r-pHd:\n") A=rphdfit$evectors[,1:3] temp=vcorrelation(A,B) print(c(temp$Q,temp$R,temp$arcq)) #[1] 0.27416824 0.08224336 43.46191537 # y-pHd cat("\n### y-pHd:\n") A=yphdfit$evectors[,1:3] temp=vcorrelation(A,B) print(c(temp$Q,temp$R,temp$arcq)) #[1] 0.31862461 0.09367548 47.04878514 # ire cat("\n### ire:\n") A=summary(irefit)$result[[3]]$B temp=vcorrelation(A,B) print(c(temp$Q,temp$R,temp$arcq)) #[1] 0.5124790 0.1363495 60.8222277 ## find optimal alpha by checking variation on dimensions and subspace distance ## need to check for each alpha, (1) dimension difference based on 0.05, 0.01 ## (2) subspace difference on the same dimension alphavec<-c(0,0.01,0.05,seq(0.1,0.9,by=0.1),0.95,0.99,1) anum<-length(alphavec) d05vec=d01vec=rep(0,anum) # dimension based on level 0.05, 0.01 qvec=rvec=aqvec=rep(0,anum) B=diag(c(1,1,1,1))[,1:3] pvec=rep(0,4*anum) # p-values dim(pvec)=c(4,anum) evectors <- rep(0, 4*4*anum) # eigenvectors based on different alpha dim(evectors) <- c(4,4,anum) for(ia in 1:anum) { # loops for dimension & evectors based on original data wasavefit<-Wasaveself(Y,Xm[,1:4],slice=nslice,alpha=alphavec[ia]); A=wasavefit$evectors[,1:3] temp=vcorrelation(A,B) qvec[ia]=temp$Q rvec[ia]=temp$R aqvec[ia]=temp$arcq evectors[,,ia]=wasavefit$evectors; temp=Wchisqself(Y,Xm[,1:4],slice=nslice,alpha=alphavec[ia]); pvec[,ia]=temp[,4]; for(i in 1:4) { if(temp[i,4]<0.05) d05vec[ia]<-temp[i,1]+1; if(temp[i,4]<0.01) d01vec[ia]<-temp[i,1]+1; } } temp=cbind(qvec,rvec,aqvec*pi/180,log(t(pvec))) par(mfrow=c(1,1)) matplot(c(0,1),c(min(temp),max(temp)),type="n",xlab=expression(alpha),ylab="log scale") abline(v=alphavec[order(rvec)[1]],lty=4) points(alphavec, -d05vec, type="l", lty=2, lwd=1) points(alphavec, -d01vec, type="l", lty=3, lwd=1) points(alphavec, log(pvec[1,]), type="l", lty=3, lwd=2, col="red") points(alphavec, log(pvec[2,]), type="l", lty=2, lwd=2, col="red") points(alphavec, log(pvec[3,]), type="l", lty=1, lwd=2, col="red") points(alphavec, log(pvec[4,]), type="l", lty=4, lwd=2, col="red") abline(v=alphavec[order(pvec[1,])[1]],lty=3) abline(v=alphavec[order(pvec[2,])[1]],lty=2) abline(v=alphavec[order(pvec[3,])[1]],lty=1) abline(v=alphavec[order(pvec[4,])[1]],lty=4) ### check results based on bootstrap ### note that it may take as long as 15 mins for 200 bootstrap samples bnum=200 # number of bootstraps d05bnum=rep(0,bnum*anum) # dimensions based on bootstrap samples, level=0.05 dim(d05bnum) <- c(bnum, anum) d01bnum=d05bnum # dimensions based on bootstrap samples, level=0.01 qvecbnum=rep(0,bnum*4*anum) # q-distance, bootstrap*dimension(1,2,3,4) dim(qvecbnum)=c(bnum,4,anum) rvecbnum=qvecbnum # r-distance, bootstrap*dimension(1,2,3,4) aqvecbnum=qvecbnum # arccos(q)-distance, bootstrap*dimension(1,2,3,4) set.seed(123) # Set random seed ttemp=proc.time() for(ib in 1:bnum) { # loops begin for bootstrapping cat("\nib=",ib,"\n") bindex=sample(seq(1:length(Y)), replace=T) Y1=Y[bindex] Xm1=Xm[bindex,] for(ia in 1:anum) { A<-Wasaveself(Y1,Xm1[,1:4],slice=nslice,alpha=alphavec[ia])$evectors; B<-evectors[,,ia]; for(i in 1:4) { temp=vcorrelation(as.matrix(A[,1:i]),as.matrix(B[,1:i])); qvecbnum[ib,i,ia]=temp$Q; rvecbnum[ib,i,ia]=temp$R; aqvecbnum[ib,i,ia]=temp$arcq; } temp=Wchisqself(Y1,Xm1[,1:4],slice=nslice,alpha=alphavec[ia]) for(i in 1:4) { if(temp[i,4]<0.05) d05bnum[ib,ia]<-temp[i,1]+1; if(temp[i,4]<0.01) d01bnum[ib,ia]<-temp[i,1]+1; } } # end of ia loops } # end of ib loops proc.time()-ttemp # count the time cost # user system elapsed # 823.89 0.59 833.75 # check dimensions dimse05 <- rep(0,anum) # dimension standard error, level=0.05 dimse01 <- rep(0,anum) # dimension standard error, level=0.01 for(ia in 1:anum) { dimse05[ia]=mean(abs(d05bnum[,ia]-d05vec[ia])); dimse01[ia]=mean(abs(d01bnum[,ia]-d01vec[ia])); } par(mfrow=c(1,1)) matplot(c(0,1),c(0,3),xlab=expression(alpha),ylab="variation of dimensions",type="n") points(alphavec,dimse05,type="b",pch=21) points(alphavec,dimse01,type="b",pch=22) par(mfrow=c(1,1)) plot(c(0,1),c(1,4),type="n") for(ib in 1:bnum) points(alphavec,d05bnum[ib,]+rnorm(1)*0.05,type="l",lty=2); points(alphavec,d05vec,type="l",lty=1,lwd=2) # check eigenvectors qvec=rvec=aqvec=rep(0,anum) B=diag(c(1,1,1,1))[,1:3] for(ia in 1:anum) { wasavefit<-Wasaveself(Y,Xm[,1:4],slice=nslice,alpha=alphavec[ia]); A=wasavefit$evectors[,1:3] temp=vcorrelation(A,B) qvec[ia]=temp$Q rvec[ia]=temp$R aqvec[ia]=temp$arcq temp=Wchisqself(Y,Xm[,1:4],slice=nslice,alpha=alphavec[ia]) for(i in 1:4) { if(temp[i,4]<0.05) d05vec[ia]<-temp[i,1]+1; if(temp[i,4]<0.01) d01vec[ia]<-temp[i,1]+1; } } plot(alphavec, qvec, type="l", lty=1) points(alphavec,rvec, type="l",lty=2) points(alphavec,aqvec*pi/180, type="l",lty=3) cat("\nOptimal alpha according to Q: ",alphavec[order(qvec)[1]]) cat("\nOptimal alpha according to R: ",alphavec[order(rvec)[1]]) cat("\nOptimal alpha according to arccos(Q): ",alphavec[order(aqvec)[1]]) cat("\n") # 0.6 abline(v=alphavec[order(qvec)[1]]) points(alphavec, d05vec/10, type="l", lty=2, col="red", lwd=2) points(alphavec, d01vec/10, type="l", lty=2, col="blue", lwd=2) ## plot eigenvector results temps=log(rvecbnum[,1:3,]) par(mfrow=c(2,3)) for(i in 1:3) { temp=as.data.frame(temps[,i,]); colnames(temp)=alphavec; boxplot(temp,ylim=c(-10,0)); title(i); } matplot(c(0,1),c(-5,0),type="n",xlab=expression(alpha),ylab="log scale",main="Median of log(1-r)") points(alphavec,apply(temps[,3,],2,median),type="b",lty=1,pch=21) points(alphavec,apply(temps[,2,],2,median),type="b",lty=2,pch=22) points(alphavec,apply(temps[,1,],2,median),type="b",lty=3,pch=23) alphavec[order(apply(temps[,3,],2,median))] legend(0,0,legend=c("dim=1","dim=2","dim=3"),lty=c(3,2,1),pch=c(23,22,21)) matplot(c(0,1),c(-5,0),type="n",xlab=expression(alpha),ylab="log scale",main="Mean of log(1-r)") points(alphavec,apply(temps[,3,],2,mean),type="b",lty=1,pch=21) points(alphavec,apply(temps[,2,],2,mean),type="b",lty=2,pch=22) points(alphavec,apply(temps[,1,],2,mean),type="b",lty=3,pch=23) legend(0,0,legend=c("dim=1","dim=2","dim=3"),lty=c(3,2,1),pch=c(23,22,21)) matplot(c(0,1),c(0,0.5),type="n",xlab=expression(alpha),ylab="log scale",main="Mean of (1-r)") points(alphavec,apply(exp(temps[,3,]),2,mean),type="b",lty=1,pch=21) points(alphavec,apply(exp(temps[,2,]),2,mean),type="b",lty=2,pch=22) points(alphavec,apply(exp(temps[,1,]),2,mean),type="b",lty=3,pch=23) legend(0,0.5,legend=c("dim=1","dim=2","dim=3"),lty=c(3,2,1),pch=c(23,22,21)) ######## check the Ozone data, ozone, Hgt, Hum, InvTmp, Temp ## need to install the "gclus" package from http://cran.r-project.org/ ## load the "gclus" package library("gclus") data(ozone) Ozone<-ozone$Ozone Height<-ozone$Hgt Humidity<-ozone$Hum ITemp<-ozone$InvTmp STemp<-ozone$Temp X<-cbind(Height,Humidity^1.68,ITemp^1.25,STemp^1.11) Y<-Ozone X1<-X[,1] X2<-X[,2] X3<-X[,3] X4<-X[,4] nNum<-length(Y) ########## number of slices used for Ozone data nslice=8 ########## for(i in 1:1) { # SIR cat("\n### SIR:\n") sirfit<-dr(Y~X1+X2+X3+X4, nslices=nslice, method="sir") print(round(sirfit$evectors,3)) print(round(summary(sirfit)$test,3)) print(summary(sirfit)$test[,3]) # r-pHd cat("### r-pHd:\n") rphdfit<-dr(Y~X1+X2+X3+X4, nslices=nslice, method="phdres") print(round(rphdfit$evectors,3)) print(round(summary(rphdfit)$test,3)) print(summary(rphdfit)$test[,3]) # y-pHd cat("\n### y-pHd:\n") yphdfit<-dr(Y~X1+X2+X3+X4, nslices=nslice, method="phdy") print(round(yphdfit$evectors,3)) print(round(summary(yphdfit)$test,3)) print(summary(yphdfit)$test[,3]) # ire cat("\n### ire:\n") irefit<-dr(Y~X1+X2+X3+X4, nslices=nslice, method="ire") print(round(summary(irefit)$result[[3]]$B,3)) print(summary(irefit)$test) print(summary(irefit)$test[,3]) # SAVE cat("\n### SAVE:\n") savefit<-dr(Y~X1+X2+X3+X4, nslices=nslice, method="save") print(round(savefit$evectors,3)) print(round(summary(savefit)$test,3)) print(summary(savefit)$test[,3]) # SIMR alphavec<-c(0,0.01,0.05,seq(0.1,0.9,by=0.1),0.95,0.99,1) anum<-length(alphavec) for(ia in 1:anum) { wasavefit<-Wasaveself(Y,X,slice=nslice,alpha=alphavec[ia]); cat("\n### SIMR, alpha=",alphavec[ia],":\n") print(round(wasavefit$evectors,3)) print(round(Wchisqself(Y,X,slice=nslice,alpha=alphavec[ia]),3)); } for(ia in 1:anum) { wasavefit<-Wasaveself(Y,X,slice=nslice,alpha=alphavec[ia]); cat("\n### SIMR, alpha=",alphavec[ia],":\n") print(round(wasavefit$evectors,3)) print(round(Wchisqself(Y,X,slice=nslice,alpha=alphavec[ia]),4)); } } # find optimal alpha by checking variation on dimensions and subspace distance # need to check for each alpha, (1) dimension difference based on 0.05, 0.01 # (2) subspace difference on the same dimension alphavec<-c(0,0.01,0.05,seq(0.1,0.9,by=0.1),0.95,0.99,1) anum<-length(alphavec) d05vec=d01vec=rep(0,anum) # dimension based on level 0.05, 0.01 pvec=rep(0,4*anum) # p-values dim(pvec)=c(4,anum) evectors <- rep(0, 4*4*anum) # eigenvectors based on different alpha dim(evectors) <- c(4,4,anum) for(ia in 1:anum) { # loops for dimension & evectors based on original data wasavefit<-Wasaveself(Y,X,slice=nslice,alpha=alphavec[ia]); evectors[,,ia]=wasavefit$evectors; temp=Wchisqself(Y,X,slice=nslice,alpha=alphavec[ia]); pvec[,ia]=temp[,4]; for(i in 1:4) { if(temp[i,4]<0.05) d05vec[ia]<-temp[i,1]+1; if(temp[i,4]<0.01) d01vec[ia]<-temp[i,1]+1; } } ## check results based on bootstrap bnum=200 # number of bootstraps d05bnum=rep(0,bnum*anum) # dimensions based on bootstrap samples, level=0.05 dim(d05bnum) <- c(bnum, anum) d01bnum=d05bnum # dimensions based on bootstrap samples, level=0.01 qvecbnum=rep(0,bnum*4*anum) # q-distance, bootstrap*dimension(1,2,3,4) dim(qvecbnum)=c(bnum,4,anum) rvecbnum=qvecbnum # r-distance, bootstrap*dimension(1,2,3,4) aqvecbnum=qvecbnum # arccos(q)-distance, bootstrap*dimension(1,2,3,4) set.seed(123) # Set random seed ttemp=proc.time() for(ib in 1:bnum) { # loops begin for bootstrapping cat("\nib=",ib,"\n") bindex=sample(seq(1:length(Y)), replace=T) Y1=Y[bindex] Xm1=X[bindex,] for(ia in 1:anum) { A<-Wasaveself(Y1,Xm1[,1:4],slice=nslice,alpha=alphavec[ia])$evectors; B<-evectors[,,ia]; for(i in 1:4) { temp=vcorrelation(as.matrix(A[,1:i]),as.matrix(B[,1:i])); qvecbnum[ib,i,ia]=temp$Q; rvecbnum[ib,i,ia]=temp$R; aqvecbnum[ib,i,ia]=temp$arcq; } temp=Wchisqself(Y1,Xm1[,1:4],slice=nslice,alpha=alphavec[ia]) for(i in 1:4) { if(temp[i,4]<0.05) d05bnum[ib,ia]<-temp[i,1]+1; if(temp[i,4]<0.01) d01bnum[ib,ia]<-temp[i,1]+1; } } # end of ia loops } # end of ib loops proc.time()-ttemp # user system elapsed # 652.67 0.42 654.26 # check dimensions dimse05 <- rep(0,anum) # dimension standard error, level=0.05 dimse01 <- rep(0,anum) # dimension standard error, level=0.01 for(ia in 1:anum) { dimse05[ia]=mean(abs(d05bnum[,ia]-d05vec[ia])); dimse01[ia]=mean(abs(d01bnum[,ia]-d01vec[ia])); } par(mfrow=c(1,1)) matplot(c(0,1),c(0,3),xlab=expression(alpha),ylab="variation of dimensions",type="n") points(alphavec,dimse05,type="b",pch=21) points(alphavec,dimse01,type="b",pch=22) par(mfrow=c(1,1)) plot(c(0,1),c(1,4),type="n") for(ib in 1:bnum) points(alphavec,d05bnum[ib,]+rnorm(1)*0.05,type="l",lty=2); points(alphavec,d05vec,type="l",lty=1,lwd=2) #plot the results temps=log(rvecbnum[,1:3,]) par(mfrow=c(1,3)) matplot(c(0,1),c(-14,0),type="n",xlab=expression(alpha),ylab="mean of log(1-r)") points(alphavec,apply(temps[,3,],2,mean),type="b",lty=1,pch=21,lwd=2) points(alphavec,apply(temps[,2,],2,mean),type="b",lty=2,pch=22,lwd=2) points(alphavec,apply(temps[,1,],2,mean),type="b",lty=3,pch=23,lwd=2) abline(v=alphavec[order(apply(temps[,3,],2,mean))[1]],lty=1) #0.60 legend(0,0,legend=c("d=1","d=2","d=3"),lty=c(3,2,1),pch=c(23,22,21)) matplot(c(0,1),c(-14,0),type="n",xlab=expression(alpha),ylab="log mean of (1-r)") points(alphavec,log(apply(exp(temps[,3,]),2,mean)),type="b",lty=1,pch=21,lwd=2) points(alphavec,log(apply(exp(temps[,2,]),2,mean)),type="b",lty=2,pch=22,lwd=2) points(alphavec,log(apply(exp(temps[,1,]),2,mean)),type="b",lty=3,pch=23,lwd=2) abline(v=alphavec[order(apply(exp(temps[,3,]),2,mean))[1]],lty=1) #0.60 legend(0,0,legend=c("d=1","d=2","d=3"),lty=c(3,2,1),pch=c(23,22,21),lwd=c(2,2,2)) ## classify Ozone data si4=0.602*X1-0.502*X2-0.597*X3+0.169*X4 sa4=-0.124*X1-0.026*X2-0.143*X3+0.981*X4 sirX=X%*%sirfit$evectors phdX=X%*%rphdfit$evectors saveX=X%*%savefit$evectors simr0=X%*%Wasaveself(Y,X,slice=nslice,alpha=0)$evectors simr2=X%*%Wasaveself(Y,X,slice=nslice,alpha=0.2)$evectors pairs(cbind(sirX,phdX)) # sir1=phd2 #> round(cor(cbind(sirX,phdX)),3) # Dir1 Dir2 Dir3 Dir4 Dir1 Dir2 Dir3 Dir4 #Dir1 1.000 0.000 0.000 0.000 0.030 0.953 -0.042 0.297 #Dir2 0.000 1.000 0.000 0.000 -0.200 0.207 0.795 -0.534 #Dir3 0.000 0.000 1.000 0.000 -0.671 -0.168 0.312 0.651 #Dir4 0.000 0.000 0.000 1.000 0.713 -0.140 0.519 0.451 pairs(cbind(saveX,phdX)) # save2=phd2, save3=phd1 round(cor(cbind(saveX,phdX)),3) # Dir1 Dir2 Dir3 Dir4 Dir1 Dir2 Dir3 Dir4 #Dir1 1.000 0.000 0.000 0.000 0.307 0.278 0.768 -0.488 #Dir2 0.000 1.000 0.000 0.000 -0.003 -0.938 0.139 -0.318 #Dir3 0.000 0.000 1.000 0.000 -0.887 0.020 0.439 0.144 #Dir4 0.000 0.000 0.000 1.000 0.345 -0.206 0.445 0.800 round(cor(cbind(saveX,simr0)),3) round(cor(cbind(saveX,simr2)),3) round(cor(cbind(simr0,simr2)),3)