# Unsupervised Machine Learning for Clustering in Political and Social Research
# Philip D. Waggoner, pdwaggoner@uchicago.edu

# Section 3

# Note: load packages from Section 1 first

# Build the pairwise distance object that the HAC fits below take as input.
# dist() is called with its defaults, i.e. Euclidean distance.
st_scale_dist <- st_scale %>%
  dist()

# Peek at the first few distances
head(st_scale_dist)


# Now, fit and visualize
# Single linkage fit, plus its dendrogram cut into k = 2 boxed groups
hc_single <- hclust(st_scale_dist, method = "single")
single <- fviz_dend(
  hc_single,
  cex = 0.65,
  k = 2,
  color_labels_by_k = FALSE,
  rect = TRUE,
  main = ""
)

# Complete linkage fit, plus its dendrogram cut into k = 2 boxed groups.
# The linkage must be "complete" here; fitting with "single" would simply
# reproduce hc_single and make the method comparison meaningless.
hc_complete <- hclust(st_scale_dist, method = "complete")
complete <- fviz_dend(
  hc_complete,
  cex = 0.65,
  k = 2,
  color_labels_by_k = FALSE,
  rect = TRUE,
  main = ""
)

# Average linkage fit, plus its dendrogram cut into k = 2 boxed groups.
# The linkage must be "average" here; fitting with "single" would simply
# reproduce hc_single and make the method comparison meaningless.
hc_average <- hclust(st_scale_dist, method = "average")
average <- fviz_dend(
  hc_average,
  cex = 0.65,
  k = 2,
  color_labels_by_k = FALSE,
  rect = TRUE,
  main = ""
)

# Ward's method ("ward.D2") fit, plus its dendrogram cut into k = 2 boxed groups
hc_ward <- hclust(st_scale_dist, method = "ward.D2")
ward <- fviz_dend(
  hc_ward,
  cex = 0.65,
  k = 2,
  color_labels_by_k = FALSE,
  rect = TRUE,
  main = ""
)

# Show the four dendrograms together on a 2 x 2 grid
grid.arrange(
  single, complete, average, ward,
  nrow = 2,
  ncol = 2
)

# Sort by each of the four inputs and show the top 10 rows, to check where California, New York, Illinois, Massachusetts, Pennsylvania, Ohio, and Michigan fall

# Top 10 rows by total session length
st %>%
  arrange(desc(t_slength)) %>%
  head(10)

# Top 10 rows by regular session length
st %>%
  arrange(desc(slength)) %>%
  head(10)

# Top 10 rows by salary
st %>%
  arrange(desc(salary_real)) %>%
  head(10)

# Top 10 rows by expenditures
st %>%
  arrange(desc(expend)) %>%
  head(10)

# Or triangular trees (sometimes called a "cladogram"):
# switch to a 2 x 2 panel layout so all four fits share one figure
par(mfrow = c(2, 2))

# Draw each fitted tree as a triangle-type dendrogram, titled by its label;
# invisible() keeps the list of plot() return values from being printed
invisible(Map(
  function(fit, title) {
    plot(as.dendrogram(fit), main = title, type = "triangle")
  },
  list(hc_single, hc_complete, hc_average, hc_ward),
  c("Single", "Complete", "Average", "Ward")
))

# Back to a single-panel layout
par(mfrow = c(1, 1))

## internal validation check
# First convert the dist object into a full matrix and peek at it
st_scale_dist_m <- st_scale_dist %>%
  as.matrix()
head(st_scale_dist_m)

# Internal validation of hierarchical clustering for 2 through 10 clusters.
# NOTE(review): clValid() receives the distance matrix here, and its `obj`
# argument treats rows as the items to cluster -- confirm that passing the
# distance matrix (rather than the scaled data) is intended.
st_prof.internal <- clValid(
  st_scale_dist_m,
  nClust = 2:10,
  clMethods = "hierarchical",
  validation = "internal"
)
summary(st_prof.internal)

# Plot the validation results as lines on a 2 x 2 panel layout, without a
# legend and with a blank title
par(mfrow = c(2, 2))
plot(
  st_prof.internal,
  legend = FALSE,
  type = "l",
  main = " "
)

# Back to a single-panel layout
par(mfrow = c(1, 1))