Chapters
Other Information
|
###################################################
### Loading the data into R
###################################################
load('sales.Rdata')
library(DMwR)
data(sales)
head(sales)
###################################################
### Exploring the data set
###################################################
summary(sales)
nlevels(sales$ID)
nlevels(sales$Prod)
length(which(is.na(sales$Quant) & is.na(sales$Val)))
sum(is.na(sales$Quant) & is.na(sales$Val))
table(sales$Insp)/nrow(sales)*100
totS <- table(sales$ID)
totP <- table(sales$Prod)
barplot(totS,main='Transactions per salespeople',names.arg='',xlab='Salespeople',
ylab='Amount')
barplot(totP,main='Transactions per product',names.arg='',xlab='Products',
ylab='Amount')
sales$Uprice <- sales$Val/sales$Quant
summary(sales$Uprice)
attach(sales)
upp <- aggregate(Uprice,list(Prod),median,na.rm=T)
topP <- sapply(c(T,F),function(o)
upp[order(upp[,2],decreasing=o)[1:5],1])
colnames(topP) <- c('Expensive','Cheap')
topP
tops <- sales[Prod %in% topP[1,],c('Prod','Uprice')]
tops$Prod <- factor(tops$Prod)
boxplot(Uprice ~ Prod,data=tops,ylab='Uprice',log="y")
vs <- aggregate(Val,list(ID),sum,na.rm=T)
scoresSs <- sapply(c(T,F),function(o)
vs[order(vs$x,decreasing=o)[1:5],1])
colnames(scoresSs) <- c('Most','Least')
scoresSs
sum(vs[order(vs$x,decreasing=T)[1:100],2])/sum(Val,na.rm=T)*100
sum(vs[order(vs$x,decreasing=F)[1:2000],2])/sum(Val,na.rm=T)*100
qs <- aggregate(Quant,list(Prod),sum,na.rm=T)
scoresPs <- sapply(c(T,F),function(o)
qs[order(qs$x,decreasing=o)[1:5],1])
colnames(scoresPs) <- c('Most','Least')
scoresPs
sum(as.double(qs[order(qs$x,decreasing=T)[1:100],2]))/
sum(as.double(Quant),na.rm=T)*100
sum(as.double(qs[order(qs$x,decreasing=F)[1:4000],2]))/
sum(as.double(Quant),na.rm=T)*100
out <- tapply(Uprice,list(Prod=Prod),
function(x) length(boxplot.stats(x)$out))
out[order(out,decreasing=T)[1:10]]
sum(out)
sum(out)/nrow(sales)*100
###################################################
### Data problems
###################################################
totS <- table(ID)
totP <- table(Prod)
nas <- sales[which(is.na(Quant) & is.na(Val)),c('ID','Prod')]
propS <- 100*table(nas$ID)/totS
propS[order(propS,decreasing=T)[1:10]]
propP <- 100*table(nas$Prod)/totP
propP[order(propP,decreasing=T)[1:10]]
detach(sales)
sales <- sales[-which(is.na(sales$Quant) & is.na(sales$Val)),]
nnasQp <- tapply(sales$Quant,list(sales$Prod),
function(x) sum(is.na(x)))
propNAsQp <- nnasQp/table(sales$Prod)
propNAsQp[order(propNAsQp,decreasing=T)[1:10]]
sales <- sales[!sales$Prod %in% c('p2442','p2443'),]
nlevels(sales$Prod)
sales$Prod <- factor(sales$Prod)
nlevels(sales$Prod)
nnasQs <- tapply(sales$Quant,list(sales$ID),function(x) sum(is.na(x)))
propNAsQs <- nnasQs/table(sales$ID)
propNAsQs[order(propNAsQs,decreasing=T)[1:10]]
nnasVp <- tapply(sales$Val,list(sales$Prod),
function(x) sum(is.na(x)))
propNAsVp <- nnasVp/table(sales$Prod)
propNAsVp[order(propNAsVp,decreasing=T)[1:10]]
nnasVs <- tapply(sales$Val,list(sales$ID),function(x) sum(is.na(x)))
propNAsVs <- nnasVs/table(sales$ID)
propNAsVs[order(propNAsVs,decreasing=T)[1:10]]
tPrice <- tapply(sales[sales$Insp != 'fraud','Uprice'],
list(sales[sales$Insp != 'fraud','Prod']),
median,na.rm=T)
noQuant <- which(is.na(sales$Quant))
sales[noQuant,'Quant'] <- ceiling(sales[noQuant,'Val'] /
tPrice[sales[noQuant,'Prod']])
noVal <- which(is.na(sales$Val))
sales[noVal,'Val'] <- sales[noVal,'Quant'] *
tPrice[sales[noVal,'Prod']]
sales$Uprice <- sales$Val/sales$Quant
save(sales,file='salesClean.Rdata')
attach(sales)
notF <- which(Insp != 'fraud')
ms <- tapply(Uprice[notF],list(Prod=Prod[notF]),function(x) {
bp <- boxplot.stats(x)$stats
c(median=bp[3],iqr=bp[4]-bp[2])
})
ms <- matrix(unlist(ms),
length(ms),2,
byrow=T,dimnames=list(names(ms),c('median','iqr')))
head(ms)
par(mfrow=c(1,2))
plot(ms[,1],ms[,2],xlab='Median',ylab='IQR',main='')
plot(ms[,1],ms[,2],xlab='Median',ylab='IQR',main='',col='grey',log="xy")
smalls <- which(table(Prod) < 20)
points(log(ms[smalls,1]),log(ms[smalls,2]),pch='+')
dms <- scale(ms)
smalls <- which(table(Prod) < 20)
prods <- tapply(sales$Uprice,sales$Prod,list)
similar <- matrix(NA,length(smalls),7,dimnames=list(names(smalls),
c('Simil','ks.stat','ks.p','medP','iqrP','medS','iqrS')))
for(i in seq(along=smalls)) {
d <- scale(dms,dms[smalls[i],],FALSE)
d <- sqrt(drop(d^2 %*% rep(1,ncol(d))))
stat <- ks.test(prods[[smalls[i]]],prods[[order(d)[2]]])
similar[i,] <- c(order(d)[2],stat$statistic,stat$p.value,ms[smalls[i],],
ms[order(d)[2],])
}
head(similar)
levels(Prod)[similar[1,1]]
nrow(similar[similar[,'ks.p'] >= 0.9,])
sum(similar[,'ks.p'] >= 0.9)
save(similar,file='similarProducts.Rdata')
###################################################
### Evaluation criteria
###################################################
library(ROCR)
data(ROCR.simple)
pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels )
perf <- performance(pred,'prec','rec')
plot(perf)
PRcurve <- function(preds,trues,...) {
require(ROCR,quietly=T)
pd <- prediction(preds,trues)
pf <- performance(pd,'prec','rec')
pf@y.values <- lapply(pf@y.values,function(x) rev(cummax(rev(x))))
plot(pf,...)
}
PRcurve(ROCR.simple$predictions, ROCR.simple$labels )
pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels )
perf <- performance(pred,'lift','rpp')
plot(perf,main='Lift Chart')
CRchart <- function(preds,trues,...) {
require(ROCR,quietly=T)
pd <- prediction(preds,trues)
pf <- performance(pd,'rec','rpp')
plot(pf,...)
}
CRchart(ROCR.simple$predictions, ROCR.simple$labels,
main='Cumulative Recall Chart')
avgNDTP <- function(toInsp,train,stats) {
if (missing(train) && missing(stats))
stop('Provide either the training data or the product stats')
if (missing(stats)) {
notF <- which(train$Insp != 'fraud')
stats <- tapply(train$Uprice[notF],
list(Prod=train$Prod[notF]),
function(x) {
bp <- boxplot.stats(x)$stats
c(median=bp[3],iqr=bp[4]-bp[2])
})
stats <- matrix(unlist(stats),
length(stats),2,byrow=T,
dimnames=list(names(stats),c('median','iqr')))
stats[which(stats[,'iqr']==0),'iqr'] <-
stats[which(stats[,'iqr']==0),'median']
}
mdtp <- mean(abs(toInsp$Uprice-stats[toInsp$Prod,'median']) /
stats[toInsp$Prod,'iqr'])
return(mdtp)
}
###################################################
### Experimental Methodology
###################################################
evalOutlierRanking <- function(testSet,rankOrder,Threshold,statsProds) {
ordTS <- testSet[rankOrder,]
N <- nrow(testSet)
nF <- if (Threshold < 1) as.integer(Threshold*N) else Threshold
cm <- table(c(rep('fraud',nF),rep('ok',N-nF)),ordTS$Insp)
prec <- cm['fraud','fraud']/sum(cm['fraud',])
rec <- cm['fraud','fraud']/sum(cm[,'fraud'])
AVGndtp <- avgNDTP(ordTS[nF,],stats=statsProds)
return(c(Precision=prec,Recall=rec,avgNDTP=AVGndtp))
}
###################################################
### The modified box plot rule
###################################################
BPrule <- function(train,test) {
notF <- which(train$Insp != 'fraud')
ms <- tapply(train$Uprice[notF],list(Prod=train$Prod[notF]),
function(x) {
bp <- boxplot.stats(x)$stats
c(median=bp[3],iqr=bp[4]-bp[2])
})
ms <- matrix(unlist(ms),length(ms),2,byrow=T,
dimnames=list(names(ms),c('median','iqr')))
ms[which(ms[,'iqr']==0),'iqr'] <- ms[which(ms[,'iqr']==0),'median']
ORscore <- abs(test$Uprice-ms[test$Prod,'median']) /
ms[test$Prod,'iqr']
return(list(rankOrder=order(ORscore,decreasing=T),
rankScore=ORscore))
}
notF <- which(sales$Insp != 'fraud')
globalStats <- tapply(sales$Uprice[notF],
list(Prod=sales$Prod[notF]),
function(x) {
bp <- boxplot.stats(x)$stats
c(median=bp[3],iqr=bp[4]-bp[2])
})
globalStats <- matrix(unlist(globalStats),
length(globalStats),2,byrow=T,
dimnames=list(names(globalStats),c('median','iqr')))
globalStats[which(globalStats[,'iqr']==0),'iqr'] <-
globalStats[which(globalStats[,'iqr']==0),'median']
ho.BPrule <- function(form, train, test, ...) {
res <- BPrule(train,test)
structure(evalOutlierRanking(test,res$rankOrder,...),
itInfo=list(preds=res$rankScore,
trues=ifelse(test$Insp=='fraud',1,0)
)
)
}
bp.res <- holdOut(learner('ho.BPrule',
pars=list(Threshold=0.1,
statsProds=globalStats)),
dataset(Insp ~ .,sales),
hldSettings(3,0.3,1234,T),
itsInfo=TRUE
)
summary(bp.res)
par(mfrow=c(1,2))
info <- attr(bp.res,'itsInfo')
PTs.bp <- aperm(array(unlist(info),dim=c(length(info[[1]]),2,3)),
c(1,3,2)
)
PRcurve(PTs.bp[,,1],PTs.bp[,,2],
main='PR curve',avg='vertical')
CRchart(PTs.bp[,,1],PTs.bp[,,2],
main='Cumulative Recall curve',avg='vertical')
###################################################
### Local outlier factors (LOF)
###################################################
ho.LOF <- function(form, train, test, k, ...) {
ntr <- nrow(train)
all <- rbind(train,test)
N <- nrow(all)
ups <- split(all$Uprice,all$Prod)
r <- list(length=ups)
for(u in seq(along=ups))
r[[u]] <- if (NROW(ups[[u]]) > 3)
lofactor(ups[[u]],min(k,NROW(ups[[u]]) %/% 2))
else if (NROW(ups[[u]])) rep(0,NROW(ups[[u]]))
else NULL
all$lof <- vector(length=N)
split(all$lof,all$Prod) <- r
all$lof[which(!(is.infinite(all$lof) | is.nan(all$lof)))] <-
SoftMax(all$lof[which(!(is.infinite(all$lof) | is.nan(all$lof)))])
structure(evalOutlierRanking(test,order(all[(ntr+1):N,'lof'],
decreasing=T),...),
itInfo=list(preds=all[(ntr+1):N,'lof'],
trues=ifelse(test$Insp=='fraud',1,0))
)
}
lof.res <- holdOut(learner('ho.LOF',
pars=list(k=7,Threshold=0.1,
statsProds=globalStats)),
dataset(Insp ~ .,sales),
hldSettings(3,0.3,1234,T),
itsInfo=TRUE
)
summary(lof.res)
par(mfrow=c(1,2))
info <- attr(lof.res,'itsInfo')
PTs.lof <- aperm(array(unlist(info),dim=c(length(info[[1]]),2,3)),
c(1,3,2)
)
PRcurve(PTs.bp[,,1],PTs.bp[,,2],
main='PR curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
PRcurve(PTs.lof[,,1],PTs.lof[,,2],
add=T,lty=2,
avg='vertical')
legend('topright',c('BPrule','LOF'),lty=c(1,2))
CRchart(PTs.bp[,,1],PTs.bp[,,2],
main='Cumulative Recall curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
CRchart(PTs.lof[,,1],PTs.lof[,,2],
add=T,lty=2,
avg='vertical')
legend('bottomright',c('BPrule','LOF'),lty=c(1,2))
###################################################
### Clustering-based outlier rankings (OR_h)
###################################################
ho.ORh <- function(form, train, test, ...) {
require(dprep,quietly=T)
ntr <- nrow(train)
all <- rbind(train,test)
N <- nrow(all)
ups <- split(all$Uprice,all$Prod)
r <- list(length=ups)
for(u in seq(along=ups))
r[[u]] <- if (NROW(ups[[u]]) > 3)
outliers.ranking(ups[[u]])$prob.outliers
else if (NROW(ups[[u]])) rep(0,NROW(ups[[u]]))
else NULL
all$lof <- vector(length=N)
split(all$lof,all$Prod) <- r
all$lof[which(!(is.infinite(all$lof) | is.nan(all$lof)))] <-
SoftMax(all$lof[which(!(is.infinite(all$lof) | is.nan(all$lof)))])
structure(evalOutlierRanking(test,order(all[(ntr+1):N,'lof'],
decreasing=T),...),
itInfo=list(preds=all[(ntr+1):N,'lof'],
trues=ifelse(test$Insp=='fraud',1,0))
)
}
orh.res <- holdOut(learner('ho.ORh',
pars=list(Threshold=0.1,
statsProds=globalStats)),
dataset(Insp ~ .,sales),
hldSettings(3,0.3,1234,T),
itsInfo=TRUE
)
summary(orh.res)
par(mfrow=c(1,2))
info <- attr(orh.res,'itsInfo')
PTs.orh <- aperm(array(unlist(info),dim=c(length(info[[1]]),2,3)),
c(1,3,2)
)
PRcurve(PTs.bp[,,1],PTs.bp[,,2],
main='PR curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
PRcurve(PTs.lof[,,1],PTs.lof[,,2],
add=T,lty=2,
avg='vertical')
PRcurve(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
legend('topright',c('BPrule','LOF','ORh'),
lty=c(1,2,1),col=c('black','black','grey'))
CRchart(PTs.bp[,,1],PTs.bp[,,2],
main='Cumulative Recall curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
CRchart(PTs.lof[,,1],PTs.lof[,,2],
add=T,lty=2,
avg='vertical')
CRchart(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
legend('bottomright',c('BPrule','LOF','ORh'),
lty=c(1,2,1),col=c('black','black','grey'))
###################################################
### The class imbalance problem
###################################################
data(iris)
data <- iris[,c(1,2,5)]
data$Species <- factor(ifelse(data$Species == 'setosa','rare','common'))
newData <- SMOTE(Species ~ .,data,perc.over=600)
table(newData$Species)
par(mfrow=c(1,2))
plot(data[,1],data[,2],pch=19+as.integer(data[,3]),main='Original Data')
plot(newData[,1],newData[,2],pch=19+as.integer(newData[,3]),main="SMOTE'd Data")
###################################################
### Naive Bayes
###################################################
nb <- function(train,test) {
require(e1071,quietly=T)
sup <- which(train$Insp != 'unkn')
data <- train[sup,c('ID','Prod','Uprice','Insp')]
data$Insp <- factor(data$Insp,levels=c('ok','fraud'))
model <- naiveBayes(Insp ~ .,data)
preds <- predict(model,test[,c('ID','Prod','Uprice','Insp')],type='raw')
return(list(rankOrder=order(preds[,'fraud'],decreasing=T),
rankScore=preds[,'fraud'])
)
}
ho.nb <- function(form, train, test, ...) {
res <- nb(train,test)
structure(evalOutlierRanking(test,res$rankOrder,...),
itInfo=list(preds=res$rankScore,
trues=ifelse(test$Insp=='fraud',1,0)
)
)
}
nb.res <- holdOut(learner('ho.nb',
pars=list(Threshold=0.1,
statsProds=globalStats)),
dataset(Insp ~ .,sales),
hldSettings(3,0.3,1234,T),
itsInfo=TRUE
)
summary(nb.res)
par(mfrow=c(1,2))
info <- attr(nb.res,'itsInfo')
PTs.nb <- aperm(array(unlist(info),dim=c(length(info[[1]]),2,3)),
c(1,3,2)
)
PRcurve(PTs.nb[,,1],PTs.nb[,,2],
main='PR curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
PRcurve(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
legend('topright',c('NaiveBayes','ORh'),
lty=1,col=c('black','grey'))
CRchart(PTs.nb[,,1],PTs.nb[,,2],
main='Cumulative Recall curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
CRchart(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
legend('bottomright',c('NaiveBayes','ORh'),
lty=1,col=c('black','grey'))
nb.s <- function(train,test) {
require(e1071,quietly=T)
sup <- which(train$Insp != 'unkn')
data <- train[sup,c('ID','Prod','Uprice','Insp')]
data$Insp <- factor(data$Insp,levels=c('ok','fraud'))
newData <- SMOTE(Insp ~ .,data,perc.over=700)
model <- naiveBayes(Insp ~ .,newData)
preds <- predict(model,test[,c('ID','Prod','Uprice','Insp')],type='raw')
return(list(rankOrder=order(preds[,'fraud'],decreasing=T),
rankScore=preds[,'fraud'])
)
}
ho.nbs <- function(form, train, test, ...) {
res <- nb.s(train,test)
structure(evalOutlierRanking(test,res$rankOrder,...),
itInfo=list(preds=res$rankScore,
trues=ifelse(test$Insp=='fraud',1,0)
)
)
}
nbs.res <- holdOut(learner('ho.nbs',
pars=list(Threshold=0.1,
statsProds=globalStats)),
dataset(Insp ~ .,sales),
hldSettings(3,0.3,1234,T),
itsInfo=TRUE
)
summary(nbs.res)
par(mfrow=c(1,2))
info <- attr(nbs.res,'itsInfo')
PTs.nbs <- aperm(array(unlist(info),dim=c(length(info[[1]]),2,3)),
c(1,3,2)
)
PRcurve(PTs.nb[,,1],PTs.nb[,,2],
main='PR curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
PRcurve(PTs.nbs[,,1],PTs.nbs[,,2],
add=T,lty=2,
avg='vertical')
PRcurve(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
legend('topright',c('NaiveBayes','smoteNaiveBayes','ORh'),
lty=c(1,2,1),col=c('black','black','grey'))
CRchart(PTs.nb[,,1],PTs.nb[,,2],
main='Cumulative Recall curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
CRchart(PTs.nbs[,,1],PTs.nbs[,,2],
add=T,lty=2,
avg='vertical')
CRchart(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
legend('bottomright',c('NaiveBayes','smoteNaiveBayes','ORh'),
lty=c(1,2,1),col=c('black','black','grey'))
###################################################
### AdaBoost
###################################################
library(RWeka)
WOW(AdaBoostM1)
data(iris)
idx <- sample(150,100)
model <- AdaBoostM1(Species ~ .,iris[idx,],
control=Weka_control(I=100))
preds <- predict(model,iris[-idx,])
head(preds)
table(preds,iris[-idx,'Species'])
prob.preds <- predict(model,iris[-idx,],type='probability')
head(prob.preds)
ab <- function(train,test) {
require(RWeka,quietly=T)
sup <- which(train$Insp != 'unkn')
data <- train[sup,c('ID','Prod','Uprice','Insp')]
data$Insp <- factor(data$Insp,levels=c('ok','fraud'))
model <- AdaBoostM1(Insp ~ .,data,
control=Weka_control(I=100))
preds <- predict(model,test[,c('ID','Prod','Uprice','Insp')],
type='probability')
return(list(rankOrder=order(preds[,'fraud'],decreasing=T),
rankScore=preds[,'fraud'])
)
}
ho.ab <- function(form, train, test, ...) {
res <- ab(train,test)
structure(evalOutlierRanking(test,res$rankOrder,...),
itInfo=list(preds=res$rankScore,
trues=ifelse(test$Insp=='fraud',1,0)
)
)
}
ab.res <- holdOut(learner('ho.ab',
pars=list(Threshold=0.1,
statsProds=globalStats)),
dataset(Insp ~ .,sales),
hldSettings(3,0.3,1234,T),
itsInfo=TRUE
)
summary(ab.res)
par(mfrow=c(1,2))
info <- attr(ab.res,'itsInfo')
PTs.ab <- aperm(array(unlist(info),dim=c(length(info[[1]]),2,3)),
c(1,3,2)
)
PRcurve(PTs.nb[,,1],PTs.nb[,,2],
main='PR curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
PRcurve(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
PRcurve(PTs.ab[,,1],PTs.ab[,,2],
add=T,lty=2,
avg='vertical')
legend('topright',c('NaiveBayes','ORh','AdaBoostM1'),
lty=c(1,1,2),col=c('black','grey','black'))
CRchart(PTs.nb[,,1],PTs.nb[,,2],
main='PR curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
CRchart(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
CRchart(PTs.ab[,,1],PTs.ab[,,2],
add=T,lty=2,
avg='vertical')
legend('bottomright',c('NaiveBayes','ORh','AdaBoostM1'),
lty=c(1,1,2),col=c('black','grey','black'))
###################################################
### Semi-supervised approaches
###################################################
set.seed(1234) # Just to ensrure you get the same results as in the book
library(DMwR)
library(e1071)
data(iris)
idx <- sample(150,100)
tr <- iris[idx,]
ts <- iris[-idx,]
nb <- naiveBayes(Species ~ .,tr)
table(predict(nb,ts),ts$Species)
trST <- tr
nas <- sample(100,90)
trST[nas,'Species'] <- NA
func <- function(m,d) {
p <- predict(m,d,type='raw')
data.frame(cl=colnames(p)[apply(p,1,which.max)],p=apply(p,1,max))
}
nbSTbase <- naiveBayes(Species ~ .,trST[-nas,])
table(predict(nbSTbase,ts),ts$Species)
nbST <- SelfTrain(Species ~ .,trST,learner('naiveBayes',list()),'func')
table(predict(nbST,ts),ts$Species)
pred.nb <- function(m,d) {
p <- predict(m,d,type='raw')
data.frame(cl=colnames(p)[apply(p,1,which.max)],
p=apply(p,1,max)
)
}
nb.st <- function(train,test) {
require(e1071,quietly=T)
train <- train[,c('ID','Prod','Uprice','Insp')]
train[which(train$Insp == 'unkn'),'Insp'] <- NA
train$Insp <- factor(train$Insp,levels=c('ok','fraud'))
model <- SelfTrain(Insp ~ .,train,
learner('naiveBayes',list()),'pred.nb')
preds <- predict(model,test[,c('ID','Prod','Uprice','Insp')],
type='raw')
return(list(rankOrder=order(preds[,'fraud'],decreasing=T),
rankScore=preds[,'fraud'])
)
}
ho.nb.st <- function(form, train, test, ...) {
res <- nb.st(train,test)
structure(evalOutlierRanking(test,res$rankOrder,...),
itInfo=list(preds=res$rankScore,
trues=ifelse(test$Insp=='fraud',1,0)
)
)
}
nb.st.res <- holdOut(learner('ho.nb.st',
pars=list(Threshold=0.1,
statsProds=globalStats)),
dataset(Insp ~ .,sales),
hldSettings(3,0.3,1234,T),
itsInfo=TRUE
)
summary(nb.st.res)
par(mfrow=c(1,2))
info <- attr(nb.st.res,'itsInfo')
PTs.nb.st <- aperm(array(unlist(info),dim=c(length(info[[1]]),2,3)),
c(1,3,2)
)
PRcurve(PTs.nb[,,1],PTs.nb[,,2],
main='PR curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
PRcurve(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
PRcurve(PTs.nb.st[,,1],PTs.nb.st[,,2],
add=T,lty=2,
avg='vertical')
legend('topright',c('NaiveBayes','ORh','NaiveBayes-ST'),
lty=c(1,1,2),col=c('black','grey','black'))
CRchart(PTs.nb[,,1],PTs.nb[,,2],
main='Cumulative Recall curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
CRchart(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
CRchart(PTs.nb.st[,,1],PTs.nb.st[,,2],
add=T,lty=2,
avg='vertical')
legend('bottomright',c('NaiveBayes','ORh','NaiveBayes-ST'),
lty=c(1,1,2),col=c('black','grey','black'))
pred.ada <- function(m,d) {
p <- predict(m,d,type='probability')
data.frame(cl=colnames(p)[apply(p,1,which.max)],
p=apply(p,1,max)
)
}
ab.st <- function(train,test) {
require(RWeka,quietly=T)
train <- train[,c('ID','Prod','Uprice','Insp')]
train[which(train$Insp == 'unkn'),'Insp'] <- NA
train$Insp <- factor(train$Insp,levels=c('ok','fraud'))
model <- SelfTrain(Insp ~ .,train,
learner('AdaBoostM1',
list(control=Weka_control(I=100))),
'pred.ada')
preds <- predict(model,test[,c('ID','Prod','Uprice','Insp')],
type='probability')
return(list(rankOrder=order(preds[,'fraud'],decreasing=T),
rankScore=preds[,'fraud'])
)
}
ho.ab.st <- function(form, train, test, ...) {
res <- ab.st(train,test)
structure(evalOutlierRanking(test,res$rankOrder,...),
itInfo=list(preds=res$rankScore,
trues=ifelse(test$Insp=='fraud',1,0)
)
)
}
ab.st.res <- holdOut(learner('ho.ab.st',
pars=list(Threshold=0.1,
statsProds=globalStats)),
dataset(Insp ~ .,sales),
hldSettings(3,0.3,1234,T),
itsInfo=TRUE
)
summary(ab.st.res)
par(mfrow=c(1,2))
info <- attr(ab.st.res,'itsInfo')
PTs.ab.st <- aperm(array(unlist(info),dim=c(length(info[[1]]),2,3)),
c(1,3,2)
)
PRcurve(PTs.ab[,,1],PTs.ab[,,2],
main='PR curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
PRcurve(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
PRcurve(PTs.ab.st[,,1],PTs.ab.st[,,2],
add=T,lty=2,
avg='vertical')
legend('topright',c('AdaBoostM1','ORh','AdaBoostM1-ST'),
lty=c(1,1,2),col=c('black','grey','black'))
CRchart(PTs.ab[,,1],PTs.ab[,,2],
main='Cumulative Recall curve',lty=1,xlim=c(0,1),ylim=c(0,1),
avg='vertical')
CRchart(PTs.orh[,,1],PTs.orh[,,2],
add=T,lty=1,col='grey',
avg='vertical')
CRchart(PTs.ab.st[,,1],PTs.ab.st[,,2],
add=T,lty=2,
avg='vertical')
legend('bottomright',c('AdaBoostM1','ORh','AdaBoostM1-ST'),
lty=c(1,1,2),col=c('black','grey','black'))
|