################################################### ### The Available Data ################################################### source("http://bioconductor.org/biocLite.R") biocLite() biocLite("ALL") library(Biobase) library(ALL) data(ALL) ALL pD <- phenoData(ALL) varMetadata(pD) table(ALL$BT) table(ALL$mol.biol) table(ALL$BT,ALL$mol.bio) featureNames(ALL)[1:10] sampleNames(ALL)[1:5] tgt.cases <- which(ALL$BT %in% levels(ALL$BT)[1:5] & ALL$mol.bio %in% levels(ALL$mol.bio)[1:4]) ALLb <- ALL[,tgt.cases] ALLb ALLb$BT <- factor(ALLb$BT) ALLb$mol.bio <- factor(ALLb$mol.bio) ################################################### ### Exploring the data set ################################################### es <- exprs(ALLb) dim(es) summary(as.vector(es)) source("http://bioconductor.org/biocLite.R") biocLite("genefilter") library(genefilter) hist(as.vector(es),breaks=80,prob=T, xlab='Expression Levels', main='Histogram of Overall Expression Levels') abline(v=c(median(as.vector(es)), shorth(as.vector(es)), quantile(as.vector(es),c(0.25,0.75))), lty=2,col=c(2,3,4,4)) legend('topright',c('Median','Shorth','1stQ','3rdQ'), lty=2,col=c(2,3,4,4)) sapply(levels(ALLb$mol.bio),function(x) summary(as.vector(es[,which(ALLb$mol.bio == x)]))) ################################################### ### Gene (Feature) Selection ################################################### rowIQRs <- function(em) rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em))) plot(rowMedians(es),rowIQRs(es), xlab='Median expression level', ylab='IQR expression level', main='Main Characteristics of Genes Expression Levels') library(genefilter) ALLb <- nsFilter(ALLb, var.func=IQR, var.cutoff=IQR(as.vector(es))/5, feature.exclude="^AFFX") ALLb ALLb <- ALLb$eset es <- exprs(ALLb) dim(es) f <- Anova(ALLb$mol.bio,p=0.01) ff <- filterfun(f) selGenes <- genefilter(exprs(ALLb),ff) sum(selGenes) ALLb <- ALLb[selGenes,] ALLb es <- exprs(ALLb) plot(rowMedians(es),rowIQRs(es), xlab='Median expression level', ylab='IQR expression level', main='Distribution Properties of the Selected Genes') featureNames(ALLb) <- make.names(featureNames(ALLb)) es <- exprs(ALLb) library(randomForest) dt <- data.frame(t(es),Mut=ALLb$mol.bio) rf <- randomForest(Mut ~ .,dt,importance=T) imp <- importance(rf) imp <- imp[,ncol(imp)-1] rf.genes <- names(imp)[order(imp,decreasing=T)[1:30]] sapply(rf.genes,function(g) tapply(dt[,g],dt$Mut,median)) library(lattice) ordMut <- order(dt$Mut) levelplot(as.matrix(dt[ordMut,rf.genes]), aspect='fill', xlab='', ylab='', scales=list( x=list( labels=c('+','-','*','|')[as.integer(dt$Mut[ordMut])], cex=0.7, tck=0) ), main=paste(paste(c('"+"','"-"','"*"','"|"'), levels(dt$Mut) ), collapse='; '), col.regions=colorRampPalette(c('white','orange','blue')) ) library(Hmisc) vc <- varclus(t(es)) clus30 <- cutree(vc$hclust,30) table(clus30) getVarsSet <- function(cluster,nvars=30,seed=NULL,verb=F) { if (!is.null(seed)) set.seed(seed) cls <- cutree(cluster,nvars) tots <- table(cls) vars <- c() vars <- sapply(1:nvars,function(clID) { if (!length(tots[clID])) stop('Empty cluster! (',clID,')') x <- sample(1:tots[clID],1) names(cls[cls==clID])[x] }) if (verb) structure(vars,clusMemb=cls,clusTots=tots) else vars } getVarsSet(vc$hclust) getVarsSet(vc$hclust) ################################################### ### Predicting Cytogenetic Abnormalities ################################################### data(iris) rpart.loocv <- function(form,train,test,...) { require(rpart,quietly=T) m <- rpart(form,train,...) p <- predict(m,test,type='class') c(accuracy=ifelse(p == resp(form,test),100,0)) } exp <- loocv(learner('rpart.loocv',list()), dataset(Species~.,iris), loocvSettings(seed=1234,verbose=F)) summary(exp) library(class) data(iris) idx <- sample(1:nrow(iris),as.integer(0.7*nrow(iris))) tr <- iris[idx,] ts <- iris[-idx,] preds <- knn(tr[,-5],ts[,-5],tr[,5],k=3) table(preds,ts[,5]) kNN <- function(form,train,test,norm=T,norm.stats=NULL,...) { require(class,quietly=TRUE) tgtCol <- which(colnames(train)==as.character(form[[2]])) if (norm) { if (is.null(norm.stats)) tmp <- scale(train[,-tgtCol],center=T,scale=T) else tmp <- scale(train[,-tgtCol],center=norm.stats[[1]],scale=norm.stats[[2]]) train[,-tgtCol] <- tmp ms <- attr(tmp,"scaled:center") ss <- attr(tmp,"scaled:scale") test[,-tgtCol] <- scale(test[,-tgtCol],center=ms,scale=ss) } knn(train[,-tgtCol],test[,-tgtCol],train[,tgtCol],...) } preds.norm <- kNN(Species ~ .,tr,ts,k=3) table(preds.norm,ts[,5]) preds.notNorm <- kNN(Species ~ .,tr,ts,norm=F,k=3) table(preds.notNorm,ts[,5]) vars <- list() vars$randomForest <- list(ntree=c(500,750,100), mtry=c(5,15,30), fs.meth=list(list('all'), list('rf',30), list('varclus',30,50))) vars$svm <- list(cost=c(1,100,500), gamma=c(0.01,0.001,0.0001), fs.meth=list(list('all'), list('rf',30), list('varclus',30,50))) vars$knn <- list(k=c(3,5,7,11), norm=c(T,F), fs.meth=list(list('all'), list('rf',30), list('varclus',30,50))) varsEnsembles <- function(tgt,train,test, varsSets, baseLearner,blPars, verb=F) { preds <- matrix(NA,ncol=length(varsSets),nrow=NROW(test)) for(v in seq(along=varsSets)) { if (baseLearner=='knn') preds[,v] <- knn(train[,varsSets[[v]]], test[,varsSets[[v]]], train[,tgt],blPars) else { m <- do.call(baseLearner, c(list(as.formula(paste(tgt, paste(varsSets[[v]], collapse='+'), sep='~')), train[,c(tgt,varsSets[[v]])]), blPars) ) if (baseLearner == 'randomForest') preds[,v] <- do.call('predict', list(m,test[,c(tgt,varsSets[[v]])], type='response')) else preds[,v] <- do.call('predict', list(m,test[,c(tgt,varsSets[[v]])])) } } ps <- apply(preds,1,function(x) levels(factor(x))[which.max(table(factor(x)))]) ps <- factor(ps, levels=1:nlevels(train[,tgt]), labels=levels(train[,tgt])) if (verb) structure(ps,ensemblePreds=preds) else ps } genericModel <- function(form,train,test, learner, fs.meth, ...) { cat('=') tgt <- as.character(form[[2]]) tgtCol <- which(colnames(train)==tgt) # Anova filtering f <- Anova(train[,tgt],p=0.01) ff <- filterfun(f) genes <- genefilter(t(train[,-tgtCol]),ff) genes <- names(genes)[genes] train <- train[,c(tgt,genes)] test <- test[,c(tgt,genes)] tgtCol <- 1 # Specific filtering if (fs.meth[[1]]=='varclus') { require(Hmisc,quietly=T) v <- varclus(as.matrix(train[,-tgtCol])) VSs <- lapply(1:fs.meth[[3]],function(x) getVarsSet(v$hclust,nvars=fs.meth[[2]])) pred <- varsEnsembles(tgt,train,test,VSs,learner,list(...)) } else { if (fs.meth[[1]]=='rf') { require(randomForest,quietly=T) rf <- randomForest(form,train,importance=T) imp <- importance(rf) imp <- imp[,ncol(imp)-1] rf.genes <- names(imp)[order(imp,decreasing=T)[1:fs.meth[[2]]]] train <- train[,c(tgt,rf.genes)] test <- test[,c(tgt,rf.genes)] } if (learner == 'knn') pred <- kNN(form, train, test, norm.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))), rowIQRs(t(as.matrix(train[,-tgtCol])))), ...) else { model <- do.call(learner,c(list(form,train),list(...))) pred <- if (learner != 'randomForest') predict(model,test) else predict(model,test,type='response') } } c(accuracy=ifelse(pred == resp(form,test),100,0)) } require(class,quietly=TRUE) require(randomForest,quietly=TRUE) require(e1071,quietly=TRUE) load('myALL.Rdata') es <- exprs(ALLb) # simple filtering ALLb <- nsFilter(ALLb, var.func=IQR,var.cutoff=IQR(as.vector(es))/5, feature.exclude="^AFFX") ALLb <- ALLb$eset # the data set featureNames(ALLb) <- make.names(featureNames(ALLb)) dt <- data.frame(t(exprs(ALLb)),Mut=ALLb$mol.bio) DSs <- list(dataset(Mut ~ .,dt,'ALL')) # The learners to evaluate TODO <- c('knn','svm','randomForest') for(td in TODO) { assign(td, experimentalComparison( DSs, c( do.call('variants', c(list('genericModel',learner=td), vars[[td]], varsRootName=td)) ), loocvSettings(seed=1234,verbose=F) ) ) save(list=td,file=paste(td,'Rdata',sep='.')) } load('knn.Rdata') load('svm.Rdata') load('randomForest.Rdata') rankSystems(svm,max=T) all.trials <- join(svm,knn,randomForest,by='variants') rankSystems(all.trials,top=10,max=T) getVariant('knn.v2',all.trials) bestknn.loocv <- function(form,train,test,...) { require(Biobase,quietly=T) require(randomForest,quietly=T) cat('=') tgt <- as.character(form[[2]]) tgtCol <- which(colnames(train)==tgt) # Anova filtering f <- Anova(train[,tgt],p=0.01) ff <- filterfun(f) genes <- genefilter(t(train[,-tgtCol]),ff) genes <- names(genes)[genes] train <- train[,c(tgt,genes)] test <- test[,c(tgt,genes)] tgtCol <- 1 # Random Forest filtering rf <- randomForest(form,train,importance=T) imp <- importance(rf) imp <- imp[,ncol(imp)-1] rf.genes <- names(imp)[order(imp,decreasing=T)[1:30]] train <- train[,c(tgt,rf.genes)] test <- test[,c(tgt,rf.genes)] # knn prediction ps <- kNN(form,train,test,norm=T, norm.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))), rowIQRs(t(as.matrix(train[,-tgtCol])))), k=5,...) structure(c(accuracy=ifelse(ps == resp(form,test),100,0)), itInfo=list(ps) ) } resTop <- loocv(learner('bestknn.loocv',pars=list()), dataset(Mut~.,dt), loocvSettings(seed=1234,verbose=F), itsInfo=T) attr(resTop,'itsInfo')[1:4] dt <- data.frame(t(exprs(ALLb)),Mut=ALLb$mol.bio) preds <- unlist(attr(resTop,'itsInfo')) table(preds,dt$Mut)