#########################################################
# This module implements some strategies of handling tasks
# of predicting rare extreme values of a continuous target
# variable.
# Namely, the strategies are based on sampling approaches
# that try to rebalance the distribution of the target variable
# (in a way similar to the existing approaches in classification)
#
# The current methods implemented are:
# SMOTE.R() a re-implementation of the smote algorithm for
#           regression
# random.underSample() undersampling the normal cases randomly
#
#########################################################

## This package is not available in the standard R package repository
## and thus needs to be installed through the package file we provide
## at the place were you got this script
require(uba)          # Ribeiro's utility-based evaluation framework


# ===================================================
# Learning a regression model using a SMOTE for regression
# resampling strategy.
#
# If called with learner=NULL (the default) is does not
# learn any model, simply returning the SMOTEd data set
#
# Examples:
#  ms <- SMOTE.R(medv ~ .,d$train,perc.under=400,perc.over=300,
#                learner='svm',gamma=0.001,cost=100)
#  newds <- SMOTE.R(fr ~ .,IBMdata,perc.under=300,k=3,perc.over=400)
#
# ---------------------------------------------------
SMOTE.R <- function(form,data,
                    thr.rel=0.5,
                    perc.over=200,k=5,
                    perc.under=200,
                    learner=NULL,...
                    )
  
  # INPUTS:
  # form - a model formula
  # data - the original training set (with the unbalanced distribution)
  # thr.rel - is the relevance threshold above which a case is considered
  #         as belonging to the rare "class"
  # per.over/100 - is the number of new cases (smoted cases) generated
  #                for each rare case. If perc.over < 100 a single case
  #                is generated uniquely for a randomly selected perc.over
  #                of the rare cases
  # k - is the number of neighbours to consider as the pool from where
  #     the new generated examples are generated
  # perc.under/100 - is the number of "normal" cases that are randomly
  #                  selected for each smoted case
  # learner - the learning system to use.
  # ...  any learning parameters to pass to learner
{
  
  # Obtaining the relevance function and parameters of the target var.
  y <- resp(form,data)
  pc <- phi.control(y, method="extremes")
  both <- all(pc$control.pts[c(2,8)] == c(1,1))
  y.relev <- phi(y,pc)
  
  # the column where the target variable is
  tgt <- target.col(form,data)

  
  if (both) {  # we have both low and high extrs
    rare.low   <- which(y.relev > thr.rel & y < pc$control.pts[4])
    smote.exsL <- smote.exs(data[rare.low,],tgt,perc.over,k)
    rare.high  <- which(y.relev > thr.rel & y > pc$control.pts[4])
    smote.exsH <- smote.exs(data[rare.high,],tgt,perc.over,k)
    rare.cases <- c(rare.low,rare.high)
    smoted.exs <- rbind(smote.exsL,smote.exsH)
  } else {
    # the indexes of the cases with rare target variable values
    rare.cases <- if (pc$control.pts[2] == 1)  which(y.relev > thr.rel & y < pc$control.pts[4]) else which(y.relev > thr.rel & y > pc$control.pts[4])

    # Get the smoted examples associated with these rare cases
    smoted.exs <- smote.exs(data[rare.cases,],tgt,perc.over,k)
  }

  # get the undersample of the "majority class" examples
  sel.maj <- sample((1:NROW(data))[-rare.cases],
                    as.integer((perc.under/100)*nrow(smoted.exs)),
                    replace=T)

  # the final data set (the undersample+the rare cases+the smoted exs)
  newdataset <- rbind(data[sel.maj,],data[rare.cases,],smoted.exs)

  # learn a model if required
  if (is.null(learner)) return(newdataset)
  else do.call(learner,list(form,newdataset,...))
}


# ===================================================
# Obtain a set of smoted examples for a set of rare cases.
#
# Examples:
# 
# ---------------------------------------------------
smote.exs <- function(T,tgt,N,k)
  # INPUTS:
  # T are the rare cases (the minority "class" cases)
  # tgt the column nr of the target variable
  # N is the percentage of over-sampling to carry out;
  # and k is the number of nearest neighours
  # OUTPUTS:
  # The result of the function is a (N/100)*T set of generate
  # examples with rare values on the target
{
  p <- dim(T)[2]

  nomattrs <- c()
  lvls <- list()
  for(x in (1:p)[-tgt])
    if (!is.numeric(T[1,x])) {
      nomattrs <- c(nomattrs,x)
      lvls[[as.character(x)]] <- list(l=levels(T[,x]),n=nlevels(T[,x]))
    }
  ranges <- rep(1,p)
  for(x in (1:p)[-c(nomattrs,tgt)]) ranges[x] <- max(T[,x]) - min(T[,x])
  
  if (N < 100) { # only a percentage of the T cases will be SMOTEd
    nT <- NROW(T)
    idx <- sample(1:nT,as.integer((N/100)*nT))
    T <- T[idx,]
    N <- 100
  }

  for(x in nomattrs) T[,x] <- as.integer(T[,x])
  T <- as.matrix(T)
  
  nT <- dim(T)[1]

  nexs <-  as.integer(N/100) # this is the number of artificial exs generated
                             # for each member of T

  new <- matrix(nrow=nexs*nT,ncol=p)    # the new cases

  for(i in 1:nT) {

    # the k NNs of case T[i,]
    xd <- scale(T,T[i,],ranges)
    for(a in nomattrs) xd[,a] <- xd[,a]!=0
    dd <- drop(xd[,-tgt]^2 %*% rep(1, ncol(xd)-1))
    kNNs <- order(dd)[2:(k+1)]
    
    for(n in 1:nexs) {
      # select randomly one of the k NNs
      neig <- sample(1:k,1)

      # the attribute values of the generated case
      idx <- (i-1)*nexs+n 
      difs <- T[kNNs[neig],-tgt]-T[i,-tgt]
      new[idx,-tgt] <- T[i,-tgt]+runif(1)*difs
      for(a in nomattrs)
        new[idx,a] <- c(T[kNNs[neig],a],T[i,a])[1+round(runif(1),0)]
#      ex[-tgt] <- T[i,-tgt]+runif(1)*difs

      # now the target value (weighted (by inverse distance) average)
      d1 <- d2 <- 0
      for(x in (1:p)[-c(nomattrs,tgt)]) {
        d1 <- abs(T[i,x] - new[idx,x])/ranges[x]
        d2 <- abs(T[kNNs[neig],x] - new[idx,x])/ranges[x]
      }
      if (length(nomattrs)) {
        d1 <- d1 + sum(T[i,nomattrs] != new[idx,nomattrs])
        d2 <- d2 + sum(T[kNNs[neig],nomattrs] != new[idx,nomattrs])
      }
      # (d2+d1-d1 = d2 and d2+d1-d2 = d1) the more distant the less weight
      new[idx,tgt] <- if (d1 == d2) (T[i,tgt]+T[kNNs[neig],tgt])/2 else (d2*T[i,tgt]+d1*T[kNNs[neig],tgt])/(d1+d2)

    }
  }

  newCases <- data.frame(new)
  for(a in nomattrs)
    newCases[,a] <- factor(newCases[,a],levels=1:lvls[[as.character(a)]]$n,labels=lvls[[as.character(a)]]$l)

  colnames(newCases) <- colnames(T)
  newCases

}


# ===================================================
# Learning a model using an undersampling strategy,
# which basically uses as many cases of the negative
# class as there are of the positive class
#
# ---------------------------------------------------
random.underSample <- function(form,data,
#                               relev.fun="extremity",relev.args=NULL,
#                               relev.pars=NULL,
                               thr.rel=0.5,
                               perc.under=200,
                               learner=NULL,...)
  # INPUTS:
  # form a model formula
  # data the original training set (with the unbalanced distribution)
  # relev.pars the parameters of the phi relevance function. If null (default)
  #            they are estimated from the given data
  # thr.rel is the relevance threshold above which a case is considered
  #         as belonging to the rare "class"
  # perc.under/100 is the number of "normal" cases that are randomly
  #                selected for each rare case
  # learner the learning system to use.
  # ...  any learning parameters to pass to learner

  
{
  y <- resp(form,data)
  pc <- phi.control(y, method="extremes")
  both <- all(pc$control.pts[c(2,8)] == c(1,1))
  y.relev <- phi(y,pc)


  # the indexes of the cases with rare target variable values
  rare.cases <- which(y.relev > thr.rel)

  
  # start by adding all the cases of the minority class
  newdata <- data[rare.cases,]

  # get the undersample of the "majority class" examples
  sel.maj <- sample((1:NROW(data))[-rare.cases],
                    as.integer((perc.under/100)*nrow(newdata)),
                    replace=T)

  newdata <- rbind(newdata,data[sel.maj,])

  if (is.null(learner)) return(newdata)
  else do.call(learner,list(form,newdata,...))
}

  
# =================================================
# Given a data set and a formula, get the name of the
# target variable.
# -------------------------------------------------
target.name <- function(form,data) {
  mt <- terms(form,data=data)
  if ((yvar <- attr(mt, "response")) <= 0) 
    stop(paste("Incorrect response variable",yvar))
  as.character(attr(mt, "variables"))[-1][yvar]
}
# ======================================================================
# Get the column on which the target variable in a formula object occurs
# in a dataset.
# ======================================================================
target.col <- function(form,data) {
  mt <- terms(form,data=data)
  if ((yvar <- attr(mt, "response")) <= 0) 
    stop(paste("Incorrect response variable",yvar))
  which(names(data)==as.character(attr(mt, "variables"))[-1][yvar])
}