Notes!
- Using the `iris` data set, answer the following questions:
- Store all the clustering results, and remember to ignore the target variable (`Species`)!
1. Apply the k-means algorithm, and find the optimal number of clusters
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
# Load iris and drop column 5 (Species, the class label) so the clustering
# only sees the four numeric measurements.
data(iris)
tr <- iris[, -5]
library(cluster)
# Pairwise distances between observations (dist() default: Euclidean);
# reused by every silhouette computation.
dists <- dist(tr)
# kmeans() starts from randomly chosen centres, so a single run can end in a
# poor local optimum and give a different ranking on every execution. Fix the
# seed and keep the best of several random starts to make the search
# reproducible. Silhouette values for larger k may therefore differ slightly
# from those of a single-start run.
set.seed(1234)
ks <- 2:10
# Average silhouette width per candidate k (preallocated, not grown).
avgS <- numeric(length(ks))
for (i in seq_along(ks)) {
  k <- ks[i]
  cl <- kmeans(tr, centers = k, iter.max = 300, nstart = 25)
  s <- silhouette(cl$cluster, dists)
  avgS[i] <- mean(s[, "sil_width"])
}
res <- data.frame(nClusters = ks, Silhouette = avgS)
# Rank the candidates from best to worst average silhouette width.
res[order(res$Silhouette, decreasing = TRUE), ]
## nClusters Silhouette
## 1 2 0.6810462
## 2 3 0.5528190
## 4 5 0.4912400
## 6 7 0.4416784
## 7 8 0.4353529
## 9 10 0.4312998
## 3 4 0.4104276
## 5 6 0.3648340
## 8 9 0.3211527
2. Using the best k, apply the PAM, CLARA, DBSCAN, FANNY, Agglomerative Hierarchical Clustering (`hclust()`) and DIANA algorithms
# Fit each clustering algorithm on the unlabeled measurements in `tr`.
# Methods that take a cluster count are fitted with k = 2.

# PAM (partitioning around medoids)
m.pam <- pam(tr, k = 2)
# CLARA (sampling-based variant of PAM)
m.clara <- clara(tr, k = 2)
# DBSCAN
# Density-based: takes a neighbourhood radius (eps) instead of a cluster
# count. It is run on the standardised data produced by scale(); the
# minimum-points parameter is left at the function's default.
# The call is namespace-qualified so it cannot be masked by another loaded
# package that also exports dbscan().
library(fpc)
d <- scale(tr)
m.dbscan <- fpc::dbscan(d, eps = 0.9)
# FANNY (fuzzy clustering)
m.fanny <- fanny(tr, k = 2)
# Agglomerative Hierarchical Clustering
# One tree per linkage; `method` is spelled out in full rather than relying
# on partial argument matching. The trees are cut into clusters later.
methds <- c("complete", "single", "average")
m.hclust.complete <- hclust(dists, method = methds[1])
m.hclust.single <- hclust(dists, method = methds[2])
m.hclust.average <- hclust(dists, method = methds[3])
# DIANA (divisive hierarchical clustering); also cut into clusters later.
m.diana <- diana(tr)
3. Compare all results using the silhouette coefficient
# Number of clusters used for the comparison. PAM, CLARA and FANNY were
# fitted with k = 2, so the hierarchical trees are cut at the same k;
# otherwise the silhouette values would compare partitions of different size.
k.best <- 2

# Average silhouette width of a vector of cluster labels, always measured on
# the same distance object `dists`.
avg.sil <- function(clustering) {
  mean(silhouette(clustering, dists)[, "sil_width"])
}

# Every method is scored from its label vector. Passing `dists` as the second
# argument of silhouette() on a clara object binds it to that method's `full`
# argument, which triggers "the condition has length > 1" (a warning on older
# R, an error on R >= 4.2); using the labels directly avoids this.
# DIANA is converted with as.hclust() before cutree().
# NOTE(review): m.dbscan$cluster may contain the label 0 for noise points, and
# DBSCAN was fitted on scaled data while `dists` is on the raw data; both are
# kept as in the original comparison -- confirm this is intended.
res.df <- data.frame(
  Method = c("PAM", "CLARA", "DBSCAN", "FANNY",
             "HC.Complete", "HC.Single", "HC.Average", "DIANA"),
  Silhouette = c(
    avg.sil(m.pam$clustering),
    avg.sil(m.clara$clustering),
    avg.sil(m.dbscan$cluster),
    avg.sil(m.fanny$clustering),
    avg.sil(cutree(m.hclust.complete, k = k.best)),
    avg.sil(cutree(m.hclust.single, k = k.best)),
    avg.sil(cutree(m.hclust.average, k = k.best)),
    avg.sil(cutree(as.hclust(m.diana), k = k.best))
  )
)
library(ggplot2)
# One bar per clustering method; bar height is its average silhouette width.
ggplot(data = res.df, mapping = aes(x = Method, y = Silhouette)) +
  geom_col()
