This section contains code for clustering data in R. The code assumes that the data being used has already been prepped for clustering.
Hierarchical Clustering (hclust())
Below is the code used for hierarchical clustering using the hclust() function in R. The example code uses text data that has been vectorized. However, any prepped record data can also be used. The code file and example dataset used can also be downloaded below.
library(stats)
# clustering libraries
library(NbClust)
library(cluster)
library(mclust)
library(amap) ## for using Kmeans (notice the cap K)
library(factoextra) ## for cluster vis, silhouette, etc.
library(purrr)
library(stylo) ## for dist.cosine
library(philentropy) ## for distance() which offers 46 metrics
library(SnowballC)
library(caTools)
library(dplyr)
library(textstem)
library(stringr)
library(wordcloud)
library(tm) ## to read in corpus (text data)
library(dplyr)
# ---------------------------------------------------------------------------- #
## LOAD DATA
## Must be numeric and unlabeled
# The first CSV column is used as row names (row.names = 1), so those labels
# are carried through to the dendrogram.
My_articles_m <- as.matrix(read.csv('C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Machine_Learning/Clustering/prepped_data/articles_prepped_for_cluster.csv', row.names = 1))
nrow(My_articles_m)
# ---------------------------------------------------------------------------- #
## Hierarchical Clustering of .txt Articles
## Cosine similarity is the basis of the distance metric due to high dimensions
## a . b / (||a|| * ||b||)
# Euclidean norm of every row (one row per article).
row_norms <- sqrt(rowSums(My_articles_m * My_articles_m))
# A row with a zero (or NA/NaN) norm cannot be normalised: the division below
# would put NaN into the similarity matrix and therefore into the distances.
# Fail early with the offending row numbers instead.
bad_rows <- !is.finite(row_norms) | row_norms == 0
if (any(bad_rows)) {
  stop(
    "Cosine similarity is undefined for rows with a zero or non-finite norm: ",
    paste(which(bad_rows), collapse = ", "),
    call. = FALSE
  )
}
# Scale each row to unit length; the vector of norms is recycled down the
# columns, so row i is divided by row_norms[i].
CosineSim <- My_articles_m / row_norms
# Dot products of unit-length rows are the pairwise cosine similarities.
CosineSim <- CosineSim %*% t(CosineSim)
# Convert similarity to a distance object (1 - similarity)
D_Cos_Sim <- as.dist(1 - CosineSim)
# Clustering using hclust() with Ward's method (ward.D2)
HClust_Ward_CosSim_SmallCorp2 <- hclust(D_Cos_Sim, method = "ward.D2")
# Plots a dendrogram of the clustering results.
# A negative hang makes all labels hang down from 0; -1 is the conventional
# value and draws the same plot as the previous -11.
plot(HClust_Ward_CosSim_SmallCorp2, cex = 0.7, hang = -1, main = "Cosine Sim")
# Plots rectangles to separate clusters for the specified number of clusters 'k'
rect.hclust(HClust_Ward_CosSim_SmallCorp2, k = 3)
K-Means
Below is the code for clustering using K-Means in R. The code file and datasets used can be downloaded below. Note: the articles dataset does not cluster well using K-Means; the games dataset clusters better.
library(stats)
# clustering libraries
library(NbClust)
library(cluster)
library(mclust)
library(amap) ## for using Kmeans (notice the cap K)
library(factoextra) ## for cluster vis, silhouette, etc.
library(purrr)
library(stylo) ## for dist.cosine
library(philentropy) ## for distance() which offers 46 metrics
library(SnowballC)
library(caTools)
library(dplyr)
library(textstem)
library(stringr)
library(wordcloud)
library(tm) ## to read in corpus (text data)
library(dplyr)
# ---------------------------------------------------------------------------- #
## LOAD DATA
## Must be numeric and unlabeled
# The first CSV column is used as row names (row.names = 1), so those labels
# are available to the visualizations.
articles <- read.csv('C:/Users/casey/OneDrive/Documents/MSDS_Courses/Spring_2023/Machine_Learning/Clustering/prepped_data/articles_prepped_for_cluster.csv', row.names = 1)
nrow(articles)
# --------------------------------------------------------------------------- #
## SILHOUETTE (find optimal value of K)
# Plots the average silhouette width for up to k.max = 5 clusters.
# The clustering function argument is spelled out as FUNcluster instead of
# relying on partial matching of `FUN`.
# NOTE(review): hcut scores hierarchical cuts; pass FUNcluster = kmeans if the
# silhouette should be computed on k-means partitions instead - confirm intent.
fviz_nbclust(
  articles,
  FUNcluster = hcut,
  method = "silhouette",
  k.max = 5
)
# --------------------------------------------------------------------------- #
## k-MEANS CLUSTERING
# kmeans() picks its starting centers at random (25 random starts here), so
# fix the seed to make the cluster assignments reproducible between runs.
set.seed(123)
kmeans_result <- kmeans(articles, centers = 2, nstart = 25)
# Print the results
print(kmeans_result)
# Cluster centers (one row per cluster)
kmeans_result$centers
## Place results in a table with the original data
cbind(articles, cluster = kmeans_result$cluster)
## See the cluster assigned to each row
kmeans_result$cluster
## This is the size (the number of points in) each cluster
kmeans_result$size
## With centers = 2 there are two clusters; their sizes are printed above
## Visualize the clusters
# Plot against the same data that was clustered (`articles`).
fviz_cluster(kmeans_result, data = articles,
             main = "Euclidean", repel = TRUE)
# --------------------------------------------------------------------------- #
## K-MEANS CLUSTERING (amap::Kmeans, which supports several distance metrics)
## To cluster words use transpose t()
## To cluster articles don't use transpose
# Fix the seed so the randomly initialised clusterings below are reproducible.
set.seed(123)
## k = 2, Euclidean, clustering articles
My_Kmeans_SmallCorp2 <- Kmeans(articles, centers = 2, method = "euclidean")
fviz_cluster(My_Kmeans_SmallCorp2, data = articles,
             main = "Euclidean k=2", repel = TRUE)
## k = 2, Spearman, clustering words (transposed data)
My_Kmeans_SmallCorp3 <- Kmeans(t(articles), centers = 2, method = "spearman")
fviz_cluster(My_Kmeans_SmallCorp3, data = t(articles),
             main = "Spearman", repel = TRUE)
## k = 2, Spearman, clustering articles
# Stored under its own name so the Manhattan result below no longer
# overwrites it.
My_Kmeans_SmallCorp3_Articles <- Kmeans(articles, centers = 2, method = "spearman")
fviz_cluster(My_Kmeans_SmallCorp3_Articles, data = articles,
             main = "Spearman", repel = TRUE)
## k = 2 and different metric (Manhattan), clustering words
My_Kmeans_SmallCorp4 <- Kmeans(t(articles), centers = 2, method = "manhattan")
fviz_cluster(My_Kmeans_SmallCorp4, data = t(articles),
             main = "manhattan", repel = TRUE)
## k = 2 and different metric (Canberra), clustering words
My_Kmeans_SmallCorp5 <- Kmeans(t(articles), centers = 2, method = "canberra")
fviz_cluster(My_Kmeans_SmallCorp5, data = t(articles),
             main = "canberra", repel = TRUE)