Workflows are infrastructures for modelling in the context of the performanceEstimation package.
These allow the user to specify learning approaches that include steps beyond the simple train+test procedures.
Below there's an example of three workflows for classification tasks: a straightforward Random Forest (with the ranger package), and two variants of that workflow, one with undersampling and another with oversampling.
# Random Forest workflow (ranger) without any resampling.
#
# Arguments:
#   form   - model formula; its left-hand side names the target column.
#   train  - data frame used to fit the forest.
#   test   - data frame to predict on.
#   ntrees - number of trees, forwarded to ranger() as `num.trees`.
#   ...    - further arguments forwarded to ranger::ranger().
#
# Returns a list with `trues` (the target column of `test`) and `preds`
# (the `predictions` element of the ranger prediction object).
mc.rf <- function(form, train, test, ntrees = 500, ...) {
  target <- as.character(form[[2]])
  # Namespace-qualified call: errors immediately if ranger is not installed,
  # whereas require() would merely return FALSE (and attach the package).
  model <- ranger::ranger(form, train, num.trees = ntrees,
                          write.forest = TRUE, ...)
  predicted <- predict(model, test)
  # Extract the target by name with `[[` so the result is a plain vector for
  # data frames and tibbles alike, independent of the column order in `test`.
  list(trues = test[[target]], preds = predicted$predictions)
}
# Random Forest workflow (ranger) preceded by random under-sampling (UBL).
#
# Arguments:
#   form   - model formula; its left-hand side names the target column.
#   train  - data frame used to fit the forest (resampled before fitting).
#   test   - data frame to predict on.
#   ntrees - number of trees, forwarded to ranger() as `num.trees`.
#   percU  - under-sampling specification. A numeric value is wrapped in a
#            list before being passed as `C.perc`; any other value is passed
#            through unchanged.
#   ...    - further arguments forwarded to ranger::ranger().
#
# Returns a list with `trues` (the target column of `test`) and `preds`
# (the `predictions` element of the ranger prediction object).
mc.rf_U <- function(form, train, test, ntrees = 500, percU = 0.5, ...) {
  target <- as.character(form[[2]])
  # Best-effort resampling: on failure the original training set is used,
  # but a warning is raised so the fallback does not go unnoticed.
  # NOTE(review): UBL documents `C.perc` as a list keyed by class name;
  # confirm that the unnamed list(percU) built here is accepted.
  new_train <- tryCatch(
    {
      c_perc <- if (is.numeric(percU)) list(percU) else percU
      UBL::RandUnderClassif(form, train, C.perc = c_perc)
    },
    error = function(cond) {
      warning("mc.rf_U: under-sampling failed, using the original ",
              "training set: ", conditionMessage(cond), call. = FALSE)
      train
    }
  )
  # Namespace-qualified call: errors immediately if ranger is not installed,
  # whereas require() would merely return FALSE (and attach the package).
  model <- ranger::ranger(form, new_train, num.trees = ntrees,
                          write.forest = TRUE, ...)
  predicted <- predict(model, test)
  # Extract the target by name with `[[` so the result is a plain vector for
  # data frames and tibbles alike, independent of the column order in `test`.
  list(trues = test[[target]], preds = predicted$predictions)
}
# Random Forest workflow (ranger) preceded by random over-sampling (UBL).
#
# Arguments:
#   form   - model formula; its left-hand side names the target column.
#   train  - data frame used to fit the forest (resampled before fitting).
#   test   - data frame to predict on.
#   ntrees - number of trees, forwarded to ranger() as `num.trees`.
#   percO  - over-sampling specification. A numeric value is wrapped in a
#            list before being passed as `C.perc`; any other value is passed
#            through unchanged.
#   ...    - further arguments forwarded to ranger::ranger().
#
# Returns a list with `trues` (the target column of `test`) and `preds`
# (the `predictions` element of the ranger prediction object).
mc.rf_O <- function(form, train, test, ntrees = 500, percO = 2, ...) {
  target <- as.character(form[[2]])
  # Best-effort resampling: on failure the original training set is used,
  # but a warning is raised so the fallback does not go unnoticed.
  # NOTE(review): UBL documents `C.perc` as a list keyed by class name;
  # confirm that the unnamed list(percO) built here is accepted.
  new_train <- tryCatch(
    {
      c_perc <- if (is.numeric(percO)) list(percO) else percO
      UBL::RandOverClassif(form, train, C.perc = c_perc)
    },
    error = function(cond) {
      warning("mc.rf_O: over-sampling failed, using the original ",
              "training set: ", conditionMessage(cond), call. = FALSE)
      train
    }
  )
  # Namespace-qualified call: errors immediately if ranger is not installed,
  # whereas require() would merely return FALSE (and attach the package).
  model <- ranger::ranger(form, new_train, num.trees = ntrees,
                          write.forest = TRUE, ...)
  predicted <- predict(model, test)
  # Extract the target by name with `[[` so the result is a plain vector for
  # data frames and tibbles alike, independent of the column order in `test`.
  list(trues = test[[target]], preds = predicted$predictions)
}
The code below runs the workflows, using a different parameter grid for each of them.
After it's done, you can use the performanceEstimation infrastructure to assess which models (learner + parametrization) have the best performance.
library(performanceEstimation)
library(mlbench)
# Example task: predict the `diabetes` column of the PimaIndiansDiabetes
# data set from all remaining columns.
data(list = "PimaIndiansDiabetes")
formula <- diabetes ~ .
dataset <- PimaIndiansDiabetes
# Evaluator function to calculate AUC.
# Requires the AUC package: install.packages("AUC")
#
# Arguments:
#   trues - true target values of the test set.
#   preds - predictions produced by the workflow.
#   ...   - ignored; accepted so extra evaluator arguments do not fail.
#
# Returns a named numeric vector with the single element `auc`.
AUC <- function(trues, preds, ...) {
  # Both calls are namespace-qualified so the package is not attached to the
  # search path on every evaluator call.
  # NOTE(review): AUC::roc() is documented as roc(predictions, labels); here
  # `trues` is passed first. Order kept as in the original, confirm intent.
  c(auc = AUC::auc(AUC::roc(trues, preds)))
}
# Estimate the AUC of every workflow variant with 2 repetitions of 5-fold
# cross-validation. The grids below list 3 ntrees values for mc.rf,
# 3 ntrees x 9 percU values for mc.rf_U and 3 ntrees x 10 percO values
# for mc.rf_O.
exp1 <- performanceEstimation(
  PredTask(formula, dataset),
  c(
    workflowVariants(
      "mc.rf",
      ntrees = c(100, 250, 500)
    ),
    workflowVariants(
      "mc.rf_U",
      ntrees = c(100, 250, 500),
      percU = c(seq(0.1, 0.9, by = 0.1))
    ),
    workflowVariants(
      "mc.rf_O",
      ntrees = c(100, 250, 500),
      percO = c(seq(1.25, 3, by = 0.25), 4, 5)
    )
  ),
  EstimationTask(
    metrics = "auc",
    method = CV(nReps = 2, nFolds = 5),
    evaluator = "AUC"
  )
)

# Rank up to 60 workflows; maxs = TRUE because a larger AUC is better.
rankWorkflows(exp1, top = 60, maxs = TRUE)