################################################################# 
# THIS FILE CONTAINS THE CODE NECESSARY TO GENERATE THE FILE    #
# 'artifData.Rdata' THAT CONTAINS THE ARTIFICIAL DATA SET USED  #
# IN THE PAPER:                                                 #
# Torgo,L and Lopes,E.: Utility-based Fraud Detection. In       #
#   Proceedings of the 22nd IJCAI'2011                          #
#################################################################
# Author : Luis Torgo (ltorgo@dcc.fc.up.pt)      Date: Mar 2011 #
# License: GPL (>= 2)                                           #
#################################################################


# =========================================
# Function for generating the artificial data
# =========================================

# Generate a two-column numeric matrix made of two clusters of points, each
# one followed by a block of outliers placed around it.
#
# Arguments:
#   n     - number of rows generated for the core of each cluster (also the
#           size of every rnorm() draw used to build the value pools).
#   percO - number of outlier rows per cluster, as a fraction of n; the
#           count used is as.integer(n * percO), i.e. it is truncated.
#   seed  - optional RNG seed; when NULL the current RNG state is used.
#
# Value:
#   A matrix without dimnames, with 2 columns and
#   2 * (n + as.integer(n * percO)) rows, ordered as: cluster 1 core,
#   cluster 1 outliers, cluster 2 core, cluster 2 outliers.
genData <- function(n, percO, seed = NULL) {
  # Parameters Cluster 1
  m1 <- 1000  # mean value
  s1 <- 50    # standard deviation
  # Parameters Cluster 2
  m2 <- 50    # mean value
  s2 <- 10    # standard deviation

  if (!is.null(seed)) set.seed(seed)

  nOut <- as.integer(n * percO)

  # Build `size` rows whose two coordinates are drawn independently (with
  # replacement) from the same pool of values. The first coordinate is
  # sampled before the second so the RNG stream is consumed in a fixed order.
  drawPairs <- function(pool, size) {
    i1 <- sample(length(pool), size, replace = TRUE)
    i2 <- sample(length(pool), size, replace = TRUE)
    cbind(pool[i1], pool[i2])
  }

  # One cluster: a core centred on `mu` followed by outliers centred
  # `shift` standard deviations above and below `mu`. Every pool mixes a
  # wider (sigma + spread) and a narrower (sigma - spread) normal.
  buildCluster <- function(mu, sigma, spread, shift) {
    corePool <- c(rnorm(n, mean = mu, sd = sigma + spread),
                  rnorm(n, mean = mu, sd = sigma - spread))
    corePts <- drawPairs(corePool, n)

    outPool <- c(rnorm(n, mean = mu + shift * sigma, sd = sigma + spread),
                 rnorm(n, mean = mu - shift * sigma, sd = sigma + spread),
                 rnorm(n, mean = mu + shift * sigma, sd = sigma - spread),
                 rnorm(n, mean = mu - shift * sigma, sd = sigma - spread))
    rbind(corePts, drawPairs(outPool, nOut))
  }

  # rbind() evaluates its arguments left to right, so cluster 1 is fully
  # generated before cluster 2.
  rbind(buildCluster(m1, s1, spread = 10, shift = 3),
        buildCluster(m2, s2, spread = 5, shift = 9))
}


# =========================================
# Function for adding the cost, benefit and utility columns
# =========================================

# Append cost, benefit and utility columns to a data set of 2-D points.
#
# Arguments:
#   dat - matrix (or data frame) whose first two columns hold the
#         coordinates of each case.
#
# Value:
#   `dat` with three extra columns: 'cost', 'benef' and 'util', where
#   util = benef - cost.
#
# Cost and benefit are both piecewise linear in the two coordinates: a
# different intercept and pair of slopes is used in each of the four
# regions obtained by comparing each coordinate against 500.
utilFunc <- function(dat) {
  x1 <- dat[, 1]
  x2 <- dat[, 2]
  lowX1 <- x1 < 500
  lowX2 <- x2 < 500

  # Vectorised over all rows at once; ifelse() picks, per row, the formula
  # of the region the point falls in.
  cost <- ifelse(lowX1,
                 ifelse(lowX2,
                        50 + x1 * 1.5 + x2 * 1.2,
                        60 + x1 * 1.6 + x2 * 2),
                 ifelse(lowX2,
                        100 + x1 * 2 + x2 * 3,
                        500 + x1 * 4 + x2 * 5))

  benefit <- ifelse(lowX1,
                    ifelse(lowX2,
                           57 + x1 * 1.6 + x2 * 1.3,
                           10 + x1 * 1.65 + x2 * 2.1),
                    ifelse(lowX2,
                           300 + x1 * 4 + x2 * 3.2,
                           5000 + x1 * 6 + x2 * 10))

  cbind(dat, cost = cost, benef = benefit, util = benefit - cost)
}


###############################################################
# Now let us use the function to create the data set
###############################################################


# Step 1: draw the two clusters (n = 1000, percO = 0.1) with a fixed seed
data <- genData(1000, 0.1, 123)

# Step 2: append six extra points that were picked by hand using
# > byHand <- locator()
x <- c(78.71168, 334.56425, 421.72281,
       1012.15181, 750.67611, 19.66878)
y <- c(299.898080, 228.558188, 1.567623,
       724.694709, 1045.724222, 397.179750)
data <- rbind(data, cbind(x, y))
colnames(data) <- c('x1', 'x2')

# Step 3: attach the cost, benefit and utility columns
data <- utilFunc(data)


# Step 4: the "normal" cases get a benefit of zero, and their utility is
# recomputed accordingly (it becomes minus their cost).

# In this small artificial example a case counts as normal when:
# (0 < x1 < 100 AND 0 < x2 < 100) OR (900 < x1 < 1100 AND 900 < x2 < 1100)
normal <- which(
  (data[, 'x1'] > 0 & data[, 'x1'] < 100 &
     data[, 'x2'] > 0 & data[, 'x2'] < 100) |
    (data[, 'x1'] > 900 & data[, 'x1'] < 1100 &
       data[, 'x2'] > 900 & data[, 'x2'] < 1100)
)
data[normal, 'benef'] <- 0
data[normal, 'util'] <- -data[normal, 'cost']


# =========================================
# Step 5: write the artificial data set to disk
# =========================================

save(data, file = 'artifData.Rdata')