################################################################# 
# THIS FILE CONTAINS THE IMPLEMENTATION OF THE "UTILITY-BASED   #
# OUTLIER RANKINGS" DESCRIBED IN THE PAPER:                     #
# Torgo,L and Lopes,E.: Utility-based Fraud Detection. In       #
#   Proceedings of the 22nd IJCAI'2011, p.1517-1522, AAAI Press.#
#                                                               #
# NOTE: If you do use this code please cite the above work      #
#################################################################
# Author : Luis Torgo (ltorgo@dcc.fc.up.pt)      Date: Mar 2011 #
# License: GPL (>= 2)                                           #
#################################################################


# ==============================================================
# The function implementing the proposed algorithm (Algorithm 1)
# --------------------------------------------------------------
# Parameters:
# HD - a data frame containing the historical data (data containing
#      information on the costs and benefits).
# ID - a data frame containing the data for which we want to obtain
#      a utility-based inspection ranking.
# regrLearner - the name of an R function that implements a regression
#               algorithm that is to be used for forecasting the
#               costs and benefits of new cases.
# outLearner - the name of an R function that implements an outlier
#              ranker.
# uf - a utility function.
# descrCols - a vector with the columns of the HD and ID data frames
#            that describe each observation.
# costCol - the column of the HD data frame that contains the cost
#           information.
# benCol - the column of the HD data frame that contains the benefit
#          information.
#
# Output:
# The result of this function is a list with five components:
# EU - a vector with as many positions as there are cases to rank,
#      where position i of the vector contains the expected utility value
#      of case i.
# EUrank - a vector with as many positions as there are cases to rank,
#          which contains the indexes of the cases ranked by decreasing EU.
# EC - a vector with as many positions as there are cases to rank, where
#      the vector contains the estimated inspection cost of each observation.
# EB - a vector with as many positions as there are cases to rank, where
#      the vector contains the estimated benefit (payoff) of each observation
#      if confirmed fraudulent.
# outP - a list with two components. Component rank contains a vector
#        with as many positions as there are cases to rank, where
#        position i of the vector contains the rank order of the
#        observation i. Component score is another vector with the
#        same size this time containing the estimated probability of each
#        observation being a fraud.
#
# Example use:
# library(DMwR)
# orhCall <- function(HD,ID,...) {
#  require(DMwR)
#  out <- outliers.ranking(HD,ID,...)
#  list(score=out$prob.outliers,rank=out$rank.outliers)
# }
#
# load('artifData.Rdata')
# histData <- as.data.frame(data[1:500,]) # historical data of inspection
# inspData <- as.data.frame(data[1000:nrow(data),1:2]) # candidates for inspection
# rm <- learner('rpartXse',pars=list(se=0.5))
# om <- learner('orhCall',pars=list())
# urank <- UOR(histData,inspData,regrLearner=rm,outLearner=om) 
#
# ==============================================================

# UOR - utility-based outlier ranking of the cases in ID.
#
# Trains one regression model for the inspection cost and one for the
# benefit on the historical data HD, asks the outlier learner for a score
# for every case in ID, and combines the three into an expected utility:
#     EU = score * uf(EB - EC) + (1 - score) * uf(-EC)
#
# Arguments:
#   HD, ID       - data frames with the historical data and the cases to rank.
#   regrLearner  - learner object handed to runLearner() for the regressions.
#   outLearner   - learner object handed to runLearner() for outlier ranking;
#                  its result must be a list with a 'score' component holding
#                  one value per row of ID.
#   uf           - utility function applied to the net payoffs.
#   descrCols    - descriptor columns of HD (indexes or column names).
#   costCol      - the single cost column of HD (index or column name).
#   benCol       - the single benefit column of HD (index or column name).
#
# Value: list(EU, EUrank, EC, EB, outP), where EUrank holds the indexes of
# the cases ordered by decreasing EU and outP is the outlier learner result.
UOR <- function(HD,ID,
                regrLearner,
                outLearner,
                uf = function(x) x,
                descrCols=1:(ncol(HD)-3),costCol=ncol(HD)-2,benCol=ncol(HD)-1) {
  # require() merely returns FALSE when the package is missing; fail here
  # with a clear message instead of later on an unknown runLearner().
  if (!require(DMwR)) {
    stop("package 'DMwR' is required by UOR() but could not be loaded",
         call. = FALSE)
  }

  # Column arguments may be given as names; translate them to positions so
  # that the rest of the function can keep working with indexes.
  toIndex <- function(cols, what) {
    if (!is.character(cols)) return(cols)
    idx <- match(cols, names(HD))
    if (any(is.na(idx))) {
      stop(what, " not found in HD: ",
           paste(cols[is.na(idx)], collapse = ", "), call. = FALSE)
    }
    idx
  }
  descrCols <- toIndex(descrCols, "descrCols")
  costCol <- toIndex(costCol, "costCol")
  benCol <- toIndex(benCol, "benCol")
  if (length(costCol) != 1 || length(benCol) != 1) {
    stop("costCol and benCol must each identify exactly one column of HD",
         call. = FALSE)
  }

  # Step 1 - Train Cost and Benefit Prediction Models
  # (each training set holds the descriptors plus the respective target)
  costData <- HD[,c(descrCols,costCol),drop=FALSE]
  benefData <- HD[,c(descrCols,benCol),drop=FALSE]

  costModel <- runLearner(regrLearner,as.formula(paste(names(HD)[costCol],'~.')),costData)
  benefitModel <- runLearner(regrLearner,as.formula(paste(names(HD)[benCol],'~.')),benefData)

  # Step 2 - Obtain Outlier Probabilities for Inspection
  # drop=FALSE keeps a data frame even with a single descriptor column.
  out <- runLearner(outLearner,HD[,descrCols,drop=FALSE],ID)

  # Step 3 - Estimate Utilities
  EC <- predict(costModel,ID)
  EB <- predict(benefitModel,ID)

  # The scores are combined element-wise with the predictions; a missing or
  # mis-sized score vector would be recycled silently into a meaningless EU.
  score <- out$score
  if (is.null(score)) {
    stop("the outlier learner did not return a 'score' component",
         call. = FALSE)
  }
  if (length(score) != length(EC)) {
    stop("the outlier learner returned ", length(score),
         " scores for ", length(EC), " cases to rank", call. = FALSE)
  }

  # Step 4 - Obtain utility ranking (solution)
  # With probability 'score' the case is a fraud and pays EB - EC,
  # otherwise the inspection cost EC is lost.
  EU <- score*uf(EB-EC) + (1-score)*uf(-EC)

  list(EU=EU,EUrank=order(EU,decreasing=TRUE),EC=EC,EB=EB,outP=out)
}