#################################################################
# THIS FILE CONTAINS THE CODE NECESSARY TO GENERATE THE FILE    #
# 'artifData.Rdata' THAT CONTAINS THE ARTIFICIAL DATA SET USED  #
# IN THE PAPER:                                                 #
# Torgo,L and Lopes,E.: Utility-based Fraud Detection. In       #
# Proceedings of the 22nd IJCAI'2011                            #
#################################################################
# Author : Luis Torgo (ltorgo@dcc.fc.up.pt)     Date: Mar 2011  #
# License: GPL (>= 2)                                           #
#################################################################


# =========================================
# Function for generating the artificial data
# =========================================
# Builds a two-column numeric matrix made of two clusters of points.
#
# For each cluster the function first draws a pool of 2*n normal values
# centred on the cluster mean (half with a wider, half with a narrower
# standard deviation) and forms n points by sampling, with replacement and
# independently for each coordinate, from that pool.  It then draws a pool
# of 4*n normal values centred a fixed number of standard deviations above
# and below the cluster mean and forms as.integer(n * percO) further points
# from it in the same way (presumably the outlying cases 'percO' refers to).
#
# Arguments:
#   n     - number of regular points per cluster.
#   percO - size of each cluster's displaced group, as a fraction of n.
#           The count is as.integer(n * percO), i.e. truncated, not rounded.
#   seed  - optional seed passed to set.seed(); when NULL the current
#           state of the random number generator is used.
#
# Value:
#   A numeric matrix with 2 columns, no dimnames and
#   2 * (n + as.integer(n * percO)) rows, ordered as: cluster 1 regular
#   points, cluster 1 displaced points, cluster 2 regular points,
#   cluster 2 displaced points.
#
# NOTE: the order of the rnorm()/sample() calls is kept exactly as in the
# original script so that a given seed reproduces the same data set.
genData <- function(n, percO, seed = NULL) {
  # Parameters Cluster 1
  m1 <- 1000  # mean value
  s1 <- 50    # standard deviation
  # Parameters Cluster 2
  m2 <- 50    # mean value
  s2 <- 10    # standard deviation

  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Number of displaced points per cluster (truncated towards zero).
  nDisplaced <- as.integer(n * percO)

  # Form 'size' points whose two coordinates are sampled independently,
  # with replacement, from 'pool'.  The first coordinate is drawn before
  # the second one, and deparse.level = 0 keeps the result free of column
  # names.
  drawPairs <- function(pool, size) {
    first <- pool[sample(length(pool), size, replace = TRUE)]
    second <- pool[sample(length(pool), size, replace = TRUE)]
    cbind(first, second, deparse.level = 0)
  }

  # Generate one cluster: n regular points followed by the displaced ones.
  #   centre, spread - mean and standard deviation of the cluster
  #   sdShift        - amount added to / subtracted from 'spread'
  #   offsetMult     - displacement of the second pool, in units of 'spread'
  genCluster <- function(centre, spread, sdShift, offsetMult) {
    wide <- spread + sdShift
    narrow <- spread - sdShift
    offset <- offsetMult * spread

    pool <- c(rnorm(n, mean = centre, sd = wide),
              rnorm(n, mean = centre, sd = narrow))
    regular <- drawPairs(pool, n)

    pool <- c(rnorm(n, mean = centre + offset, sd = wide),
              rnorm(n, mean = centre - offset, sd = wide),
              rnorm(n, mean = centre + offset, sd = narrow),
              rnorm(n, mean = centre - offset, sd = narrow))
    displaced <- drawPairs(pool, nDisplaced)

    rbind(regular, displaced)
  }

  # Cluster 1 must be generated before cluster 2 (random stream order).
  cluster1 <- genCluster(m1, s1, sdShift = 10, offsetMult = 3)
  cluster2 <- genCluster(m2, s2, sdShift = 5, offsetMult = 9)

  rbind(cluster1, cluster2)
}


# =========================================
# Function for adding the
# cost, benefit and utility columns
# =========================================
# Appends the columns 'cost', 'benef' and 'util' to 'dat'.
#
# Cost and benefit are piecewise-linear functions of the first two columns
# of 'dat'; the coefficients depend on whether each of those two columns is
# below 500.  Utility is benefit minus cost.
#
# Arguments:
#   dat - a matrix (or data frame) whose first two columns are numeric.
#
# Value:
#   cbind(dat, cost, benef, util), with the three new columns named
#   'cost', 'benef' and 'util'.
#
# The computation is vectorised over the rows of 'dat'; it evaluates the
# same formulas as a row-by-row apply() would, but also copes with
# zero-row input.
utilFunc <- function(dat) {
  x1 <- dat[, 1]
  x2 <- dat[, 2]
  low1 <- x1 < 500
  low2 <- x2 < 500

  cost <- ifelse(low1,
                 ifelse(low2,
                        50 + x1 * 1.5 + x2 * 1.2,
                        60 + x1 * 1.6 + x2 * 2),
                 ifelse(low2,
                        100 + x1 * 2 + x2 * 3,
                        500 + x1 * 4 + x2 * 5))

  benef <- ifelse(low1,
                  ifelse(low2,
                         57 + x1 * 1.6 + x2 * 1.3,
                         10 + x1 * 1.65 + x2 * 2.1),
                  ifelse(low2,
                         300 + x1 * 4 + x2 * 3.2,
                         5000 + x1 * 6 + x2 * 10))

  cbind(dat, cost = cost, benef = benef, util = benef - cost)
}


###############################################################
# Now let us use the functions to create the data set
###############################################################

# First generate the data
data <- genData(1000, 0.1, 123)

# a few points added by hand using
# > byHand <- locator()
x <- c(78.71168, 334.56425, 421.72281, 1012.15181, 750.67611, 19.66878)
y <- c(299.898080, 228.558188, 1.567623, 724.694709, 1045.724222, 397.179750)
data <- rbind(data, cbind(x, y))
colnames(data) <- c('x1', 'x2')

# Now let's add the cost, benefit and utility columns
data <- utilFunc(data)

# Now we will put to zero the benefit of the "normal" cases
# and re-calculate their utility.
# For this small artificial example we will set a case as normal if:
# (0 < x1 < 100 AND 0 < x2 < 100) OR (900 < x1 < 1100 AND 900 < x2 < 1100)
normal <- which(
  (data[, 1] > 0 & data[, 1] < 100 & data[, 2] > 0 & data[, 2] < 100) |
    (data[, 1] > 900 & data[, 1] < 1100 & data[, 2] > 900 & data[, 2] < 1100)
)
data[normal, 'benef'] <- 0
# With a benefit of zero, utility (benefit - cost) reduces to minus the cost.
data[normal, 'util'] <- -data[normal, 'cost']

# =========================================
# Finally, let's save the artificial data set
# =========================================
save(data, file = 'artifData.Rdata')