\documentclass[a4paper]{article}
\usepackage[round]{natbib}
\usepackage{hyperref}
%\usepackage{natbib}
%\VignetteIndexEntry{Classification of Breast Cancer Clinical Stage with Gene Expression Data (without Results)}
%\VignetteDepends{knitr, gdata}
%\VignetteEngine{knitr::knitr}
\newcommand{\R}[1]{\texttt{#1}}
\hypersetup{%
  pdftitle = {Classification of Breast Cancer Clinical Stage with Gene Expression Data},
  pdfsubject = {package vignette},
  pdfauthor = {Zhu Wang},
  %% change colorlinks to false for pretty printing
  colorlinks = {true},
  linkcolor = {blue},
  citecolor = {blue},
  urlcolor = {red},
  hyperindex = {true},
  linktocpage = {true},
}

\author{Zhu Wang \\
  University of Tennessee Health Science Center\\
  zwang145@uthsc.edu}
\title{Classification of Breast Cancer Clinical Stage with Gene Expression Data}
\begin{document}
\sloppy
\setkeys{Gin}{width=0.6\textwidth, keepaspectratio}
\maketitle

This document presents an analysis of the MAQC-II project human breast cancer data set, using the boosting algorithms developed in \citet{wang2016robust, wang2016robust2} and implemented in the R package \R{bst}. 

The data set comes from the MicroArray Quality Control (MAQC) II project and includes 278 breast cancer samples, 164 of which are estrogen receptor (ER) positive. The data files \verb|GSE20194_series_matrix.txt.gz| and \verb|GSE20194_MDACC_Sample_Info.xls| can be downloaded from \url{http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?token=rhojvaiwkcsaihq&acc=GSE20194}. After reading the data, some unused variables are removed. From the 22283 genes, the data set is pre-screened to retain the 3000 genes with the largest absolute two-sample t-statistics; these 3000 genes are then standardized. 
<<echo=FALSE,results='hide'>>=
## knitr setup: session options and chunk defaults for this vignette
library("knitr")
## prompt/width settings so echoed R code fits the printed page
options(prompt = "R> ", continue = " ", width = 60, useFancyQuotes = FALSE)
## chunk defaults: tidy echoed code, plain (non-highlighted) output on a white
## background, figures written as both pdf and postscript under cancerfigure/
opts_chunk$set(tidy = TRUE, highlight = FALSE, background = "white",
               dev = c('pdf', 'postscript'), fig.path = 'cancerfigure/')
@
<<message=FALSE, eval=FALSE>>=
### The data files below were downloaded on June 1, 2016
require("gdata")
### read the series matrix and transpose so rows are samples, columns are probes
bc <- t(read.delim("GSE20194_series_matrix.txt.gz",sep="",header=FALSE,skip=80))
### the first row carries the probe identifiers; use it as column names
colnames(bc) <- bc[1,]
### drop the identifier row and the first two metadata columns
bc <- bc[-1, -c(1, 2)]
###The last column is empty with variable name !series_matrix_table_end, thus omitted
bc <- bc[, -22284]
### convert character entries to numeric
mode(bc) <- "numeric"
### clinical sample information, including estrogen receptor (ER) status
dat1 <- read.xls("GSE20194_MDACC_Sample_Info.xls", sheet=1, header=TRUE)
### code ER status as +1 ("P", positive) and -1 otherwise
y <- ifelse(dat1$characteristics..ER_status == "P", 1, -1)
table(y)
### absolute two-sample t-statistic of each gene, comparing ER groups
res <- numeric(ncol(bc))
for(i in seq_len(ncol(bc)))
  res[i] <- abs(t.test(bc[, i] ~ y)$statistic)
### find 3000 largest absolute value of t-statistic
tmp <- order(res, decreasing=TRUE)[1:3000]
### keep and standardize the selected genes
dat <- scale(bc[, tmp])
@
Set up configuration parameters. 
<<message=FALSE>>=
## number of repeated random training/test splits
nrun <- 100
## proportions of training labels to contaminate (flip)
per <- c(0, 0.05, 0.10, 0.15)
## base learner; the second entry "ls" (componentwise least squares) is used
learntype <- c("tree", "ls")[2]
## cross-validation tuning criterion
tuning <- "error"
## number of cores for parallel cross-validation
n.cores <- 4
## draw summary boxplots of selected results?
plot.it <- TRUE
### robust tuning parameters used in bst/rbst function
## one entry per loss family, aligned with bsttype/rbsttype below
s <- c(0.9, 1.01, 0.5, -0.2, 0.8, -0.5, -0.2)
## step sizes (learning rates), aligned with the families below
nu <- c(0.01, 0.1, 0.01, rep(0.1, 4))
### boosting iteration number
m <- 100
### whether to truncate the predicted values in each boosting iteration?
ctr.trun <- c(TRUE, rep(FALSE, 6))
### loss families used in bst function
bsttype <- c("closs", "gloss", "qloss", "binom", "binom", "hinge", "expo")
### and corresponding labels
bsttype1 <- c("ClossBoost", "GlossBoost", "QlossBoost", "LogitBoost", "LogitBoost", "HingeBoost", "AdaBoost")
### robust loss families used in rbst function
rbsttype <- c("closs", "gloss", "qloss", "tbinom", "binomd", "thinge", "texpo")
### and corresponding labels
rbsttype1 <- c("ClossBoostQM", "GlossBoostQM", "QlossBoostQM", "TLogitBoost", "DlogitBoost", "THingeBoost", "TAdaBoost")
@
The training data contain 50 randomly selected samples with positive estrogen receptor status and 50 samples with negative estrogen receptor status; the remaining samples are designated as the test data. The training data are contaminated by randomly switching response variable labels at the pre-specified proportions \R{per}=\Sexpr{per}. This process is repeated \R{nrun}=\Sexpr{nrun} times. The base learner is \R{learntype}=\Sexpr{learntype} (with quotes). To select the optimal boosting iteration up to the maximum value \R{m}=\Sexpr{m}, we run five-fold cross-validation and average the classification errors. In cross-validation, the number of cores for parallel computing is set by \R{n.cores}=\Sexpr{n.cores}. Selected results are plotted if \R{plot.it=TRUE}. Gradient-based boosting includes ClossBoost, GlossBoost, QlossBoost, LogitBoost, HingeBoost and AdaBoost. Robust boosting via \R{rbst} includes ClossBoostQM, GlossBoostQM, QlossBoostQM, TLogitBoost, DlogitBoost, THingeBoost and TAdaBoost.
<<cancer, fig.show='hold', eval=FALSE>>=
## summary7: six-number summary (min, quartiles, mean, max) plus the standard
## deviation; used to tabulate results across the nrun repetitions
summary7 <- function(x) c(summary(x), sd=sd(x))
## start the timer; total elapsed time is printed after the simulation
ptm <- proc.time()
for(k in 1:7){   ### k controls which family in bst, and rfamily in rbst 
  ## result matrices: one row per repetition, one column per contamination level;
  ## suffix .m1 = gradient boosting (bst), .m2 = robust boosting (rbst);
  ## err = test error at the CV-chosen iteration, errbest = minimum test error,
  ## nvar = number of selected variables, mstopbest/mstopcv = iteration with the
  ## best test error / the iteration chosen by cross-validation
  err.m1 <- err.m2 <- nvar.m1 <- nvar.m2 <- errbest.m1 <- errbest.m2 <- matrix(NA, ncol=4, nrow=nrun)
  mstopbest.m1 <- mstopbest.m2 <- mstopcv.m1 <- mstopcv.m2 <- matrix(NA, ncol=4, nrow=nrun)
  colnames(err.m1) <- colnames(err.m2) <- c("cont-0%", "cont-5%", "cont-10%", "cont-15%")
  colnames(mstopcv.m1) <- colnames(mstopcv.m2) <- colnames(err.m1)
  colnames(nvar.m1) <- colnames(nvar.m2) <- colnames(err.m1)
  colnames(errbest.m1) <- colnames(errbest.m2) <- colnames(err.m1)
  colnames(mstopbest.m1) <- colnames(mstopbest.m2) <- colnames(err.m1)
  for (ii in 1:nrun){
    ## fixed seed per repetition so the random split is reproducible
    set.seed(1000+ii)
    ## training set: 50 ER-positive and 50 ER-negative samples; the rest is test
    trid <- c(sample(which(y==1))[1:50], sample(which(y==-1))[1:50])
    dtr <- dat[trid, ]
    dte <- dat[-trid, ]
    ytrold <- y[trid]
    yte <- y[-trid]
    ### number of patients/no. variables in training and test data
    dim(dtr)
    dim(dte) 
    ###randomly contaminate data
    ntr <- length(trid)
    ## reuse the same seed so the contamination order is reproducible as well
    set.seed(1000+ii)
    ## random permutation of training indices; its leading entries are the
    ## observations whose labels get flipped below
    con <- sample(ntr)
    for(j in 1){     ### controls learntype
      for(i in 1:4){    ### i controls how many percentage of data contaminated
        ytr <- ytrold
        percon <- per[i]
        ### randomly flip labels of the samples in training set according to pre-defined contamination level
        if(percon > 0){
          ji <- con[1:(percon*ntr)]
          ytr[ji] <- -ytrold[ji]
        }
        ## gradient boosting with the k-th loss family
        dat.m1 <- bst(x=dtr, y=ytr, ctrl = bst_control(mstop=m, center=FALSE, trace=FALSE, nu=nu[k], s=s[k], trun=ctr.trun[k]), family = bsttype[k], 
        learner = learntype[j])
        ## test misclassification error and training loss along the boosting path
        err1 <- predict(dat.m1, newdata=dte, newy=yte, type="error")
        err1tr <- predict(dat.m1, newdata=dtr, newy=ytr, type="loss")
        ### cross-validation to select best boosting iteration
        set.seed(1000+ii) 
        cvm1 <- cv.bst(x=dtr, y=ytr, K=5, n.cores=n.cores, ctrl = bst_control(mstop=m, center=FALSE, trace=FALSE, nu=nu[k], s=s[k], trun=ctr.trun[k]), family = bsttype[k], 
        learner = learntype[j], main=bsttype[k], type=tuning, plot.it=FALSE)
        ## CV-chosen iteration, forced to be at least 10
        optmstop <- max(10, which.min(cvm1$cv))
        err.m1[ii, i] <- err1[optmstop]
        nvar.m1[ii, i] <- nsel(dat.m1, optmstop)[optmstop]
        errbest.m1[ii, i] <- min(err1)     
        mstopbest.m1[ii, i] <- which.min(err1)
        mstopcv.m1[ii, i] <- optmstop
        ## robust boosting counterpart with the k-th robust loss family
        dat.m2 <- rbst(x=dtr, y=ytr, ctrl = bst_control(mstop=m, iter=100, nu=nu[k], s=s[k], trun=ctr.trun[k], center=FALSE, trace=FALSE), rfamily = rbsttype[k], learner = learntype[j])
        err2 <- predict(dat.m2, newdata=dte, newy=yte, type="error")
        err2tr <- predict(dat.m2, newdata=dtr, newy=ytr, type="loss")
        ### cross-validation to select best boosting iteration
        set.seed(1000+ii) 
        cvm2 <- cv.rbst(x=dtr, y=ytr, K=5, n.cores=n.cores, ctrl = bst_control(mstop=m, iter=100, nu=nu[k], s=s[k], trun=ctr.trun[k], center=FALSE, trace=FALSE), rfamily = rbsttype[k], 
        learner = learntype[j], main=rbsttype[k], type=tuning, plot.it=FALSE)
        optmstop <- max(10, which.min(cvm2$cv))
        err.m2[ii, i] <- err2[optmstop]
        nvar.m2[ii, i] <- nsel(dat.m2, optmstop)[optmstop]
        errbest.m2[ii, i] <- min(err2)     
        mstopbest.m2[ii, i] <- which.min(err2)
        mstopcv.m2[ii, i] <- optmstop
      }
    }
    ## report summaries only after the final repetition (ii == nrun)
    if(ii %% nrun ==0){
    if(bsttype[k]%in% c("closs", "gloss", "qloss"))
    cat(paste("\nbst family ", bsttype1[k], ", s=", s[k], ", nu=", nu[k], sep=""), "\n")
    if(bsttype[k]%in% c("binom", "hinge", "expo"))
    cat(paste("\nbst family ", bsttype1[k], ", nu=", nu[k], sep=""), "\n")
    cat("best misclassification error from bst\n")
    print(round(apply(errbest.m1, 2, summary7),4))
    cat("CV based misclassification error from bst\n")
    print(round(apply(err.m1, 2, summary7),4))
    cat("best mstop with best misclassification error from bst\n")
    print(round(apply(mstopbest.m1, 2, summary7),0))
    cat("best mstop with CV from bst\n")
    print(round(apply(mstopcv.m1, 2, summary7), 0))
    cat("nvar from bst\n")
    print(round(apply(nvar.m1, 2, summary7),1))
    
    cat(paste("\nrbst family ", rbsttype1[k], ", s=", s[k], ", nu=", nu[k], sep=""), "\n") 
    cat("\nbest misclassification error from rbst\n")
    print(round(apply(errbest.m2, 2, summary7),4))
    cat("CV based misclassification error from rbst\n")
    print(round(apply(err.m2, 2, summary7),4))
    cat("best mstop with best misclassification error from rbst\n")
    print(round(apply(mstopbest.m2, 2, summary7),0))
    cat("best mstop with CV from rbst\n")
    print(round(apply(mstopcv.m2, 2, summary7),0))
    cat("nvar from rbst\n")
    print(round(apply(nvar.m2, 2, summary7),1))
    ## collect all results for family k in a list (kept for further use;
    ## NOTE(review): res is overwritten on each k iteration and not saved)
    res <- list(err.m1=err.m1, nvar.m1=nvar.m1, 
    errbest.m1=errbest.m1,
    mstopbest.m1=mstopbest.m1,
    mstopcv.m1=mstopcv.m1,
    err.m2=err.m2, nvar.m2=nvar.m2,
    errbest.m2=errbest.m2,
    mstopbest.m2=mstopbest.m2,
    mstopcv.m2=mstopcv.m2,
    s=s[k], nu=nu[k], trun=ctr.trun[k], family = bsttype[k], rfamily = rbsttype[k])
    ## boxplots of test errors and numbers of selected variables, bst vs rbst
    if(plot.it){
      par(mfrow=c(2, 1))        
      boxplot(err.m1, main="Misclassification error", subset="", sub=bsttype1[k])
      boxplot(err.m2, main="Misclassification error", subset="", sub=rbsttype1[k])
      boxplot(nvar.m1, main="No. variables", subset="", sub=bsttype1[k])
      boxplot(nvar.m2, main="No. variables", subset="", sub=rbsttype1[k])
    }
    ## optional diagnostic plots for the last fitted models (disabled by default)
    check <- FALSE
    if(check){
      par(mfrow=c(3, 1))
      title <- paste('percentage of contamination ', percon, sep='')
      plot(err2tr, main=title, ylab="Loss value", xlab="Iteration", type="l", lty="dashed", col="red")
      points(err1tr, type="l", lty="solid", col="black")
      legend("topright", c(bsttype1[k], rbsttype1[k]), lty=c("solid", "dashed"), col=c("black","red"))
      plot(err2, main=title, ylab="Misclassification error", xlab="Iteration", type="l", lty="dashed", col="red")
      points(err1, type="l")
      legend("bottomright", c(bsttype1[k], rbsttype1[k]), lty=c("solid", "dashed"), col=c("black","red"))
      plot(nsel(dat.m2, m), main=title, ylab="No. variables", xlab="Iteration", lty="dashed", col="red", type="l")
      points(nsel(dat.m1, m), ylab="No. variables", xlab="Iteration", lty="solid", type="l", col="black")
      legend("bottomright", c(bsttype1[k], rbsttype1[k]), lty=c("solid", "dashed"), col=c("black","red"))
    }
  }
}
}
  print(proc.time()-ptm)
  @

  <<sessionInfo>>=
  sessionInfo();
  @
  \bibliographystyle{plainnat}
  \bibliography{bst}

\end{document}

