Introducing Workflows

Workflows are infrastructures for modelling in the context of the performanceEstimation package.

These allow the user to specify learning approaches that include steps beyond the simple train+test procedures.

Below there's an example of three workflows for classification tasks: a straightforward Random Forest (using the ranger package) and two variants of that workflow, one with random undersampling and another with random oversampling.

# Random Forest workflow built on the ranger package.
#
# Arguments:
#   form   - model formula; its left-hand side names the target column.
#   train  - data frame used to fit the forest.
#   test   - data frame to predict on; it must contain the target column.
#   ntrees - number of trees, passed to ranger as `num.trees`.
#   ...    - further arguments forwarded to ranger::ranger().
#
# Returns a list with two elements: `trues` (the target column of `test`)
# and `preds` (the `predictions` element of the ranger prediction object).
mc.rf <- function(form,train,test,ntrees=500,...) {

  # Look the target up by name in `test` itself: an index computed on
  # `train` would select the wrong column if `test` is ordered differently.
  tgt_name <- as.character(form[[2]])

  # Namespace-qualified call: errors immediately when ranger is missing,
  # whereas require() would only return FALSE and fail later.
  m <- ranger::ranger(form, train, num.trees = ntrees,
                      write.forest = TRUE, ...)
  p <- predict(m, test)

  list(trues = test[[tgt_name]], preds = p$predictions)

}

# Random Forest workflow (ranger) preceded by random under-sampling of the
# training set with UBL::RandUnderClassif().
#
# Arguments:
#   form   - model formula; its left-hand side names the target column.
#   train  - data frame used for resampling and for fitting the forest.
#   test   - data frame to predict on; it must contain the target column.
#   ntrees - number of trees, passed to ranger as `num.trees`.
#   percU  - under-sampling setting handed to `C.perc`: a numeric value is
#            wrapped in a list, anything else is passed through unchanged.
#   ...    - further arguments forwarded to ranger::ranger().
#
# Returns a list with two elements: `trues` (the target column of `test`)
# and `preds` (the `predictions` element of the ranger prediction object).
# If the resampling step raises an error, a warning is issued and the model
# is fitted on the original `train`.
mc.rf_U <- function(form,train,test,ntrees=500,percU=0.5,...) {

  # Look the target up by name in `test` itself: an index computed on
  # `train` would select the wrong column if `test` is ordered differently.
  tgt_name <- as.character(form[[2]])

  new_train <- tryCatch({

    # NOTE(review): list(percU) is an unnamed list, while UBL's documentation
    # describes C.perc as a named list keyed by class - confirm that the
    # resampling really happens instead of always reaching the fallback.
    c_perc <- if (is.numeric(percU)) list(percU) else percU
    UBL::RandUnderClassif(form, train, C.perc = c_perc)

  },
    error = function(cond) {
      # Best-effort fallback is kept, but no longer silent.
      warning("mc.rf_U: under-sampling failed, training on the original ",
              "data: ", conditionMessage(cond), call. = FALSE)
      train
    })

  # Namespace-qualified call: errors immediately when ranger is missing,
  # whereas require() would only return FALSE and fail later.
  m <- ranger::ranger(form, new_train, num.trees = ntrees,
                      write.forest = TRUE, ...)
  p <- predict(m, test)

  list(trues = test[[tgt_name]], preds = p$predictions)

}

# Random Forest workflow (ranger) preceded by random over-sampling of the
# training set with UBL::RandOverClassif().
#
# Arguments:
#   form   - model formula; its left-hand side names the target column.
#   train  - data frame used for resampling and for fitting the forest.
#   test   - data frame to predict on; it must contain the target column.
#   ntrees - number of trees, passed to ranger as `num.trees`.
#   percO  - over-sampling setting handed to `C.perc`: a numeric value is
#            wrapped in a list, anything else is passed through unchanged.
#   ...    - further arguments forwarded to ranger::ranger().
#
# Returns a list with two elements: `trues` (the target column of `test`)
# and `preds` (the `predictions` element of the ranger prediction object).
# If the resampling step raises an error, a warning is issued and the model
# is fitted on the original `train`.
mc.rf_O <- function(form,train,test,ntrees=500,percO=2,...) {

  # Look the target up by name in `test` itself: an index computed on
  # `train` would select the wrong column if `test` is ordered differently.
  tgt_name <- as.character(form[[2]])

  new_train <- tryCatch({

    # NOTE(review): list(percO) is an unnamed list, while UBL's documentation
    # describes C.perc as a named list keyed by class - confirm that the
    # resampling really happens instead of always reaching the fallback.
    c_perc <- if (is.numeric(percO)) list(percO) else percO
    UBL::RandOverClassif(form, train, C.perc = c_perc)

  },
    error = function(cond) {
      # Best-effort fallback is kept, but no longer silent.
      warning("mc.rf_O: over-sampling failed, training on the original ",
              "data: ", conditionMessage(cond), call. = FALSE)
      train
    })

  # Namespace-qualified call: errors immediately when ranger is missing,
  # whereas require() would only return FALSE and fail later.
  m <- ranger::ranger(form, new_train, num.trees = ntrees,
                      write.forest = TRUE, ...)
  p <- predict(m, test)

  list(trues = test[[tgt_name]], preds = p$predictions)

}

Comparison and Evaluation

The code below runs the three workflows, using a different parameter grid for each of them.

Once it is done, you can use the performanceEstimation infrastructure to assess which models (learner + parametrization) achieve the best performance.

library(performanceEstimation)

library(mlbench)
# Load the PimaIndiansDiabetes data set (provided by the mlbench package
# attached above) into the workspace and keep it under a generic name.
data("PimaIndiansDiabetes")
dataset <- PimaIndiansDiabetes
# Model formula: predict the `diabetes` column from all remaining columns.
formula <- diabetes ~ .

# Function to calculate AUC (area under the ROC curve) with the AUC package.
#
# Arguments:
#   trues - passed as the first argument of AUC::roc().
#   preds - passed as the second argument of AUC::roc().
#   ...   - accepted but not used.
#
# Returns a named numeric vector of the form c(auc = <value>).
AUC <- function(trues,preds,...) {
  # Requires the AUC package: install.packages("AUC")
  # Namespace-qualified calls replace library(AUC) inside the body, so the
  # package is not re-attached to the search path on every evaluation.
  # NOTE(review): AUC::roc() is documented as roc(predictions, labels), but
  # the true values are passed first here - confirm the intended order.
  c(auc = AUC::auc(AUC::roc(trues, preds)))
}


# Estimate the "auc" metric with 2 repetitions of 5-fold cross-validation
# for every workflow variant:
#   mc.rf   -  3 variants (ntrees)
#   mc.rf_U - 27 variants (ntrees x percU)
#   mc.rf_O - 30 variants (ntrees x percO)
exp1 <- performanceEstimation(
  PredTask(formula, dataset),
  c(
    workflowVariants("mc.rf",
                     ntrees = c(100, 250, 500)),
    workflowVariants("mc.rf_U",
                     ntrees = c(100, 250, 500),
                     percU = seq(0.1, 0.9, by = 0.1)),
    workflowVariants("mc.rf_O",
                     ntrees = c(100, 250, 500),
                     percO = c(seq(1.25, 3, by = 0.25), 4, 5))
  ),
  EstimationTask(metrics = "auc",
                 method = CV(nReps = 2, nFolds = 5),
                 evaluator = "AUC")
)

# Rank the workflows by the estimated metric; top = 60 covers all 60
# variants defined above, and maxs = TRUE treats larger values as better.
rankWorkflows(exp1, top = 60, maxs = TRUE)