#################################################################
# THIS FILE CONTAINS THE IMPLEMENTATION OF THE "UTILITY-BASED   #
# OUTLIER RANKINGS" DESCRIBED IN THE PAPER:                     #
# Torgo,L and Lopes,E.: Utility-based Fraud Detection. In       #
# Proceedings of the 22nd IJCAI'2011, p.1517-1522, AAAI Press.  #
#                                                               #
# NOTE: If you use this code please cite the above work         #
#################################################################
# Author : Luis Torgo (ltorgo@dcc.fc.up.pt)     Date: Mar 2011  #
# License: GPL (>= 2)                                           #
#################################################################

# ==============================================================
# The function implementing the proposed algorithm (Algorithm 1)
# --------------------------------------------------------------
# Parameters:
#  HD - a data frame containing the historical data (data containing
#       information on the costs and benefits).
#  ID - a data frame containing the data for which we want to obtain
#       a utility-based inspection ranking.
#  regrLearner - a learner object (built with learner(), as in the
#       example use) naming the R function that implements the
#       regression algorithm that is to be used for forecasting the
#       costs and benefits of new cases.
#  outLearner - a learner object (built with learner(), as in the
#       example use) naming the R function that implements an outlier
#       ranker.
#  uf - a utility function.
#  descrCols - a vector with the columns of the HD and ID data frames
#       that describe each observation.
#  costCol - the column of the HD data frame that contains the cost
#       information.
#  benCol - the column of the HD data frame that contains the benefit
#       information.
#
# Output:
#  The result of this function is a list with five components:
#  EU - a vector with as many positions as there are cases to rank,
#       where position i of the vector contains the expected utility
#       value of case i.
#  EUrank - a vector with as many positions as there are cases to rank,
#       containing the indexes of the cases ranked by decreasing EU.
#  EC - a vector with as many positions as there are cases to rank, where
#       the vector contains the estimated inspection cost of each
#       observation.
#  EB - a vector with as many positions as there are cases to rank, where
#       the vector contains the estimated benefit (payoff) of each
#       observation if confirmed fraudulent.
#  outP - a list with two components. Component rank contains a vector
#       with as many positions as there are cases to rank, where
#       position i of the vector contains the rank order of the
#       observation i. Component score is another vector with the
#       same size, this time containing the estimated probability of
#       each observation being a fraud.
#
# Example use:
# library(DMwR)
# orhCall <- function(HD,ID,...) {
#   require(DMwR)
#   out <- outliers.ranking(HD,ID,...)
#   list(score=out$prob.outliers,rank=out$rank.outliers)
# }
#
# load('artifData.Rdata')
# histData <- as.data.frame(data[1:500,])  # historical data of inspection
# inspData <- as.data.frame(data[1000:nrow(data),1:2]) # candidates for inspection
# rm <- learner('rpartXse',pars=list(se=0.5))
# om <- learner('orhCall',pars=list())
# urank <- UOR(histData,inspData,regrLearner=rm,outLearner=om)
#
# ==============================================================
UOR <- function(HD,ID,
                regrLearner, outLearner,
                uf = function(x) x,
                descrCols=1:(ncol(HD)-3),costCol=ncol(HD)-2,benCol=ncol(HD)-1)
{
  # runLearner() is provided by DMwR. library() is used instead of require()
  # so that a missing package stops execution here with a clear message,
  # rather than require() returning FALSE and the function failing later
  # with "could not find function runLearner". The package is still attached.
  library(DMwR)

  # Step 1 - Train Cost and Benefit Prediction Models
  # Each training set holds the descriptor columns plus one target column;
  # the target is the response of a "<target> ~ ." formula.
  costData <- HD[, c(descrCols, costCol), drop = FALSE]
  benefData <- HD[, c(descrCols, benCol), drop = FALSE]
  costModel <- runLearner(regrLearner,
                          as.formula(paste(names(HD)[costCol], '~.')),
                          costData)
  benefitModel <- runLearner(regrLearner,
                             as.formula(paste(names(HD)[benCol], '~.')),
                             benefData)

  # Step 2 - Obtain Outlier Probabilities for Inspection
  # drop = FALSE keeps a data frame even with a single descriptor column
  # (plain HD[,descrCols] would collapse to a bare vector in that case).
  # NOTE(review): ID is handed over unchanged, so it is presumably expected
  # to hold exactly the descriptor columns (as in the example use) - confirm.
  out <- runLearner(outLearner, HD[, descrCols, drop = FALSE], ID)

  # The outlier scores are combined element-wise with the predictions below;
  # refuse a score vector of the wrong size instead of letting R recycle it
  # (or silently produce an empty result when the component is missing).
  nCases <- NROW(ID)
  if (length(out$score) != nCases) {
    stop("the outlier learner returned ", length(out$score),
         " score(s) but there are ", nCases, " case(s) to rank",
         call. = FALSE)
  }

  # Step 3 - Estimate Utilities
  EC <- predict(costModel, ID)     # estimated inspection cost of each case
  EB <- predict(benefitModel, ID)  # estimated benefit of each case

  # Step 4 - Obtain utility ranking (solution)
  # Expected utility of inspecting a case: with probability `score` the case
  # is an outlier and the utility is uf(EB-EC); otherwise it is uf(-EC).
  EU <- out$score * uf(EB - EC) + (1 - out$score) * uf(-EC)

  list(EU = EU,
       EUrank = order(EU, decreasing = TRUE),
       EC = EC,
       EB = EB,
       outP = out)
}