Data Mining With R - learning with case studies

Chapters
Chapter 1: Introduction
Chapter 2: Predicting Algae Blooms
Chapter 3: Predicting Stock Market Returns
Chapter 4: Detecting Fraudulent Transactions
Chapter 5: Classifying Microarray Samples
Other Information
Book R Package
R Code of Chapter 5 (right-click here to save the code in a local file)
###################################################
### The Available Data
###################################################
source("http://bioconductor.org/biocLite.R")
biocLite()
biocLite("ALL")


library(Biobase)
library(ALL)
data(ALL)


ALL


pD <- phenoData(ALL)
varMetadata(pD)


table(ALL$BT)
table(ALL$mol.biol)
table(ALL$BT,ALL$mol.bio)


featureNames(ALL)[1:10]
sampleNames(ALL)[1:5]


tgt.cases <- which(ALL$BT %in% levels(ALL$BT)[1:5] & 
                   ALL$mol.bio %in% levels(ALL$mol.bio)[1:4])
ALLb <- ALL[,tgt.cases]
ALLb


ALLb$BT <- factor(ALLb$BT)
ALLb$mol.bio <- factor(ALLb$mol.bio)


###################################################
### Exploring the data set
###################################################
es <- exprs(ALLb)
dim(es)


summary(as.vector(es))


source("http://bioconductor.org/biocLite.R")
biocLite("genefilter")


library(genefilter)
hist(as.vector(es),breaks=80,prob=T,
     xlab='Expression Levels',
     main='Histogram of Overall Expression Levels')
abline(v=c(median(as.vector(es)),
           shorth(as.vector(es)),
           quantile(as.vector(es),c(0.25,0.75))),
       lty=2,col=c(2,3,4,4))
legend('topright',c('Median','Shorth','1stQ','3rdQ'),
       lty=2,col=c(2,3,4,4))


sapply(levels(ALLb$mol.bio),function(x) summary(as.vector(es[,which(ALLb$mol.bio == x)])))



###################################################
### Gene (Feature) Selection
###################################################
rowIQRs <- function(em) 
  rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
plot(rowMedians(es),rowIQRs(es),
     xlab='Median expression level',
     ylab='IQR expression level',
     main='Main Characteristics of Genes Expression Levels')


library(genefilter)
ALLb <- nsFilter(ALLb,
                 var.func=IQR,
                 var.cutoff=IQR(as.vector(es))/5, 
                 feature.exclude="^AFFX")
ALLb


ALLb <- ALLb$eset
es <- exprs(ALLb)
dim(es)


f <- Anova(ALLb$mol.bio,p=0.01)
ff <- filterfun(f)
selGenes <- genefilter(exprs(ALLb),ff)


sum(selGenes) 
ALLb <- ALLb[selGenes,]
ALLb


es <- exprs(ALLb)
plot(rowMedians(es),rowIQRs(es),
     xlab='Median expression level',
     ylab='IQR expression level',
     main='Distribution Properties of the Selected Genes')


featureNames(ALLb) <- make.names(featureNames(ALLb))
es <- exprs(ALLb)


library(randomForest)
dt <- data.frame(t(es),Mut=ALLb$mol.bio)
rf <- randomForest(Mut ~  .,dt,importance=T)
imp <- importance(rf)
imp <- imp[,ncol(imp)-1]
rf.genes <- names(imp)[order(imp,decreasing=T)[1:30]]


sapply(rf.genes,function(g) tapply(dt[,g],dt$Mut,median))


library(lattice)
ordMut <- order(dt$Mut)
levelplot(as.matrix(dt[ordMut,rf.genes]),
          aspect='fill', xlab='', ylab='',
          scales=list(
            x=list(
              labels=c('+','-','*','|')[as.integer(dt$Mut[ordMut])],
              cex=0.7,
              tck=0)
            ),
          main=paste(paste(c('"+"','"-"','"*"','"|"'),
                           levels(dt$Mut)
                          ),
                     collapse='; '),
          col.regions=colorRampPalette(c('white','orange','blue'))
          )


library(Hmisc)
vc <- varclus(t(es))
clus30 <- cutree(vc$hclust,30)
table(clus30)


getVarsSet <- function(cluster,nvars=30,seed=NULL,verb=F) 
{
  if (!is.null(seed)) set.seed(seed)

  cls <- cutree(cluster,nvars)
  tots <- table(cls)
  vars <- c()
  vars <- sapply(1:nvars,function(clID)
    {
      if (!length(tots[clID])) stop('Empty cluster! (',clID,')')
      x <- sample(1:tots[clID],1)
      names(cls[cls==clID])[x]
    })
  if (verb)  structure(vars,clusMemb=cls,clusTots=tots)
  else       vars
}
getVarsSet(vc$hclust)
getVarsSet(vc$hclust)



###################################################
### Predicting Cytogenetic Abnormalities
###################################################
data(iris)
rpart.loocv <- function(form,train,test,...) {
  require(rpart,quietly=T)
  m <- rpart(form,train,...)
  p <- predict(m,test,type='class')
  c(accuracy=ifelse(p == resp(form,test),100,0))
}
exp <- loocv(learner('rpart.loocv',list()),
             dataset(Species~.,iris),
             loocvSettings(seed=1234,verbose=F))


summary(exp)


library(class)
data(iris)
idx <- sample(1:nrow(iris),as.integer(0.7*nrow(iris)))
tr <- iris[idx,]
ts <- iris[-idx,]
preds <- knn(tr[,-5],ts[,-5],tr[,5],k=3)
table(preds,ts[,5])


kNN <- function(form,train,test,norm=T,norm.stats=NULL,...) {
  require(class,quietly=TRUE)
  tgtCol <- which(colnames(train)==as.character(form[[2]]))
  if (norm) {
    if (is.null(norm.stats)) tmp <- scale(train[,-tgtCol],center=T,scale=T)
    else tmp <- scale(train[,-tgtCol],center=norm.stats[[1]],scale=norm.stats[[2]])
    train[,-tgtCol] <- tmp
    ms <- attr(tmp,"scaled:center")
    ss <- attr(tmp,"scaled:scale")
    test[,-tgtCol] <- scale(test[,-tgtCol],center=ms,scale=ss)
  }
  knn(train[,-tgtCol],test[,-tgtCol],train[,tgtCol],...)
}
preds.norm <- kNN(Species ~ .,tr,ts,k=3)
table(preds.norm,ts[,5])
preds.notNorm <- kNN(Species ~ .,tr,ts,norm=F,k=3)
table(preds.notNorm,ts[,5])


vars <- list()
vars$randomForest <- list(ntree=c(500,750,100),
                          mtry=c(5,15,30),
                          fs.meth=list(list('all'),
                                       list('rf',30),
                                       list('varclus',30,50)))
vars$svm <- list(cost=c(1,100,500),
                 gamma=c(0.01,0.001,0.0001),
                 fs.meth=list(list('all'),
                              list('rf',30),
                              list('varclus',30,50)))
vars$knn <- list(k=c(3,5,7,11),
                 norm=c(T,F),
                 fs.meth=list(list('all'),
                              list('rf',30),
                              list('varclus',30,50)))


varsEnsembles <- function(tgt,train,test,
                          varsSets,
                          baseLearner,blPars,
                          verb=F)
{
  preds <- matrix(NA,ncol=length(varsSets),nrow=NROW(test))
  for(v in seq(along=varsSets)) {
    if (baseLearner=='knn')
      preds[,v] <- knn(train[,varsSets[[v]]],
                       test[,varsSets[[v]]],
                       train[,tgt],blPars)
    else {
      m <- do.call(baseLearner,
                   c(list(as.formula(paste(tgt,
                                           paste(varsSets[[v]],
                                                 collapse='+'),
                                           sep='~')),
                          train[,c(tgt,varsSets[[v]])]),
                     blPars)
                   )
      if (baseLearner == 'randomForest')
        preds[,v] <- do.call('predict',
                             list(m,test[,c(tgt,varsSets[[v]])],
                                  type='response'))
      else
        preds[,v] <- do.call('predict',
                             list(m,test[,c(tgt,varsSets[[v]])]))
    }
  }
  ps <- apply(preds,1,function(x)
                 levels(factor(x))[which.max(table(factor(x)))])
  ps <- factor(ps,
               levels=1:nlevels(train[,tgt]),
               labels=levels(train[,tgt]))
  if (verb) structure(ps,ensemblePreds=preds) else ps
}


genericModel <- function(form,train,test,
                         learner,
                         fs.meth,
                         ...)
  {
    cat('=')
    tgt <- as.character(form[[2]])
    tgtCol <- which(colnames(train)==tgt)

    # Anova filtering  
    f <- Anova(train[,tgt],p=0.01)
    ff <- filterfun(f)
    genes <- genefilter(t(train[,-tgtCol]),ff)
    genes <- names(genes)[genes]
    train <- train[,c(tgt,genes)]
    test <- test[,c(tgt,genes)]
    tgtCol <- 1

    # Specific filtering 
    if (fs.meth[[1]]=='varclus') {
      require(Hmisc,quietly=T)
      v <- varclus(as.matrix(train[,-tgtCol]))
      VSs <- lapply(1:fs.meth[[3]],function(x)
                    getVarsSet(v$hclust,nvars=fs.meth[[2]]))
      pred <- varsEnsembles(tgt,train,test,VSs,learner,list(...))

    } else {
      if (fs.meth[[1]]=='rf') {
        require(randomForest,quietly=T)
        rf <- randomForest(form,train,importance=T)
        imp <- importance(rf)
        imp <- imp[,ncol(imp)-1]
        rf.genes <- names(imp)[order(imp,decreasing=T)[1:fs.meth[[2]]]]
        train <- train[,c(tgt,rf.genes)]
        test <- test[,c(tgt,rf.genes)]
      }

      if (learner == 'knn') 
        pred <- kNN(form,
             train,
             test,
             norm.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))),
                             rowIQRs(t(as.matrix(train[,-tgtCol])))),
             ...)
      else {
        model <- do.call(learner,c(list(form,train),list(...)))
        pred <- if (learner != 'randomForest') predict(model,test)
                else predict(model,test,type='response')
      }

    }

    c(accuracy=ifelse(pred == resp(form,test),100,0))
  }


require(class,quietly=TRUE)
require(randomForest,quietly=TRUE)
require(e1071,quietly=TRUE)
load('myALL.Rdata')
es <- exprs(ALLb)
# simple filtering
ALLb <- nsFilter(ALLb,
                 var.func=IQR,var.cutoff=IQR(as.vector(es))/5, 
                 feature.exclude="^AFFX")
ALLb <- ALLb$eset
# the data set
featureNames(ALLb) <- make.names(featureNames(ALLb))
dt <- data.frame(t(exprs(ALLb)),Mut=ALLb$mol.bio)
DSs <- list(dataset(Mut ~ .,dt,'ALL'))
# The learners to evaluate
TODO <- c('knn','svm','randomForest')
for(td in TODO) {
  assign(td,
         experimentalComparison(
              DSs,
              c(
                do.call('variants',
                        c(list('genericModel',learner=td),
                          vars[[td]],
                          varsRootName=td))
                ),
               loocvSettings(seed=1234,verbose=F)
                                )
         )
  save(list=td,file=paste(td,'Rdata',sep='.'))
}


load('knn.Rdata')
load('svm.Rdata')
load('randomForest.Rdata')


rankSystems(svm,max=T)


all.trials <- join(svm,knn,randomForest,by='variants')


rankSystems(all.trials,top=10,max=T)


getVariant('knn.v2',all.trials)


bestknn.loocv <- function(form,train,test,...) {
  require(Biobase,quietly=T)
  require(randomForest,quietly=T)
  cat('=')
  tgt <- as.character(form[[2]])
  tgtCol <- which(colnames(train)==tgt)
  # Anova filtering
  f <- Anova(train[,tgt],p=0.01)
  ff <- filterfun(f)
  genes <- genefilter(t(train[,-tgtCol]),ff)
  genes <- names(genes)[genes]
  train <- train[,c(tgt,genes)]
  test <- test[,c(tgt,genes)]
  tgtCol <- 1
  # Random Forest filtering
  rf <- randomForest(form,train,importance=T)
  imp <- importance(rf)
  imp <- imp[,ncol(imp)-1]
  rf.genes <- names(imp)[order(imp,decreasing=T)[1:30]]
  train <- train[,c(tgt,rf.genes)]
  test <- test[,c(tgt,rf.genes)]
  # knn prediction
  ps <- kNN(form,train,test,norm=T, 
            norm.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))),
                            rowIQRs(t(as.matrix(train[,-tgtCol])))),
            k=5,...)
  structure(c(accuracy=ifelse(ps == resp(form,test),100,0)),
            itInfo=list(ps)
           )
}
resTop <- loocv(learner('bestknn.loocv',pars=list()),
                dataset(Mut~.,dt),
                loocvSettings(seed=1234,verbose=F),
                itsInfo=T)


attr(resTop,'itsInfo')[1:4]


dt <- data.frame(t(exprs(ALLb)),Mut=ALLb$mol.bio)


preds <- unlist(attr(resTop,'itsInfo'))
table(preds,dt$Mut)