# import needed libraries
library(factoextra) # for visualizing
library(tidymodels) # for tidy(), glance(), and augment()

# load in the data (numerical columns)
data <- iris[,1:4]

# run kmeans and make predictions
km <- kmeans(data, centers = 3)

# see a high-level summary of the clustering
summary(km)

# to see clusters next to original data
cbind(iris, km$cluster)  # manually adds the cluster labels to the original data
augment(km, iris)        # tidymodels-style: adds a `.cluster` column

# get cluster centers in tidy format (one row per cluster, one column per feature)
tidy(km)

# get model-level summary statistics (total within-cluster SS, etc.)
glance(km)

# visualize the clusters (notice that some of the clusters overlap - why?)
fviz_cluster(km, data)

# try clustering with different numbers of clusters to explore optimal k
kclusts <- tibble(k = 1:9) |>                # create a tibble of different k values (1 to 9)
  mutate(
    kclust = map(k, ~kmeans(data, .x)),      # run k-means for each value of k
    tidied = map(kclust, tidy),              # extract cluster centers (tidy format)
    glanced = map(kclust, glance),           # extract overall summary statistics
    augmented = map(kclust, augment, data)
  )

# view the tibble, where each row corresponds to a different value of k
kclusts

# unnest the data for easier plotting and inspection

# expands the cluster centers into rows (per k and per cluster)
clusters <- kclusts |>
  unnest(cols = c(tidied))

# expands the full data with .cluster for each row and each k
assignments <- kclusts |>
  unnest(cols = c(augmented))

# expands summary stats for each value of k (like tot.withinss)
clusterings <- kclusts |>
  unnest(cols = c(glanced))

# visualize how cluster assignments change with different values of k
ggplot(data = assignments, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point(aes(color = .cluster), alpha = 0.8) + 
  facet_wrap(~ k) +
  geom_point(data = clusters, size = 10, shape = "x")

# plot total within-cluster sum of squares for each k to find the "elbow"
ggplot(clusterings, aes(k, tot.withinss)) +
  geom_line() +
  geom_point()

# an alternative approach to kmeans: hierarchical clustering
hclust <- hkmeans(data, k = 3)

# visualize with a dendogram
fviz_dend(hclust)