Notes!

1. Apply the k-means algorithm, and find the optimal number of clusters

library(DMwR)
## Loading required package: lattice
## Loading required package: grid
# Load the iris data set and keep the first four columns as the clustering
# input; the fifth column is left out.
data(iris)

tr <- iris[, 1:4]

library(cluster)

# Pairwise distances between observations (dist() default metric); computed
# once and reused for every silhouette evaluation.
dists <- dist(tr)

# Candidate numbers of clusters to evaluate.
ks <- 2:10

# kmeans() starts from randomly chosen centres. Fix the seed and use several
# random starts per k so the ranking is reproducible and a single unlucky
# start cannot leave one k stuck in a poor local optimum.
set.seed(1234)

# Average silhouette width of the k-means partition for each candidate k.
avgS <- vapply(ks, function(k) {
  cl <- kmeans(tr, centers = k, iter.max = 300, nstart = 25)
  s <- silhouette(cl$cluster, dists)
  mean(s[, "sil_width"])
}, numeric(1))

# Rank the candidate k values from best to worst average silhouette width.
res <- data.frame(nClusters = ks, Silhouette = avgS)
res[order(res$Silhouette, decreasing = TRUE), ]
##   nClusters Silhouette
## 1         2  0.6810462
## 2         3  0.5528190
## 4         5  0.4912400
## 6         7  0.4416784
## 7         8  0.4353529
## 9        10  0.4312998
## 3         4  0.4104276
## 5         6  0.3648340
## 8         9  0.3211527

2. Using the best k, apply the PAM, CLARA, DBSCAN, FANNY, Agglomerative Hierarchical Clustering (hclust()) and DIANA algorithms

# Fit each clustering algorithm on the iris measurements. The partitioning
# methods below use k = 2, the value ranked first in step 1.

# PAM: partitioning around medoids
m.pam <- pam(tr, k = 2)

# CLARA: clustering large applications
m.clara <- clara(tr, k = 2)

# DBSCAN: fitted on the standardised data `d`, with neighbourhood radius
# eps = 0.9 and the remaining parameters left at their defaults.
# NOTE(review): every other method here is fitted on the unscaled `tr` --
# confirm that scaling only for DBSCAN is intended.
library(fpc)
d <- scale(tr)
m.dbscan <- dbscan(d, eps = 0.9)

# FANNY: fuzzy analysis clustering
m.fanny <- fanny(tr, k = 2)

# Agglomerative Hierarchical Clustering: one tree per linkage method, all
# built from the shared distance object `dists`. The argument name is spelled
# out in full (`method`) rather than relying on partial matching of `meth`.
methds <- c("complete", "single", "average")
m.hclust.complete <- hclust(dists, method = methds[1])
m.hclust.single <- hclust(dists, method = methds[2])
m.hclust.average <- hclust(dists, method = methds[3])

# DIANA: divisive hierarchical clustering
m.diana <- diana(tr)

3. Compare all results using the silhouette coefficient

# Number of clusters chosen in step 1 (k = 2 had the highest average
# silhouette width). The hierarchical trees are cut at this same k so that
# every method is compared on an equal footing with PAM, CLARA and FANNY.
best.k <- 2

# Average silhouette width of a plain vector of cluster labels, evaluated on
# the shared distance object `dists`.
avgSil <- function(labels) {
  mean(silhouette(labels, dists)[, "sil_width"])
}

# One row per method with its average silhouette width.
# - pam/fanny objects carry their own silhouette information, so no distance
#   argument is needed.
# - For clara objects the second argument of silhouette() is `full`, not a
#   distance object; passing `dists` there triggers the "condition has
#   length > 1" warning (an error on R >= 4.2). `full = TRUE` requests the
#   silhouette over all observations explicitly.
# NOTE(review): DBSCAN was fitted on scaled data but is scored on the
# unscaled `dists`, and any noise label in m.dbscan$cluster is scored as if
# it were a regular cluster -- confirm this is intended.
res.df <- data.frame(
  Method = c("PAM", "CLARA", "DBSCAN", "FANNY",
             "HC.Complete", "HC.Single", "HC.Average", "DIANA"),
  Silhouette = c(
    mean(silhouette(m.pam)[, "sil_width"]),
    mean(silhouette(m.clara, full = TRUE)[, "sil_width"]),
    avgSil(m.dbscan$cluster),
    mean(silhouette(m.fanny)[, "sil_width"]),
    avgSil(cutree(m.hclust.complete, k = best.k)),
    avgSil(cutree(m.hclust.single, k = best.k)),
    avgSil(cutree(m.hclust.average, k = best.k)),
    avgSil(cutree(as.hclust(m.diana), k = best.k))
  )
)
## Warning in if (!full) return(NextMethod()): the condition has length > 1 and
## only the first element will be used
library(ggplot2)

# Bar chart with one bar per method; bar height is the Silhouette column
# of res.df.
ggplot(data = res.df, mapping = aes(x = Method, y = Silhouette)) +
  geom_bar(stat = "identity")