# Unsupervised Machine Learning for Clustering in Political and Social Research
# Philip D. Waggoner, pdwaggoner@uchicago.edu

# Section 4

# Note: load packages from Section 1 first

# Fit k-means with two clusters.
# The seed is fixed first because kmeans() draws random starting centers
# (25 restarts via nstart), so results would otherwise vary between runs.
set.seed(634)

km <- kmeans(x = st_scale, centers = 2, nstart = 25)

# Show the overall structure of the fitted km object
str(km)

# Pull out individual components of the fit, e.g., the cluster assignment
# of each observation, the cluster centers, and the within-cluster sums
# of squares
km[["cluster"]]
km[["centers"]]
km[["withinss"]]

# Turn the cluster assignments into a data frame and preview the first 10 rows.
# NOTE(review): the name `t` masks base::t() as a variable; it is kept as-is
# in case later code refers to it.
t <- data.frame(as.table(km[["cluster"]]))
head(t, n = 10)

# Visualize
# Store each observation's cluster assignment in the data frame as a factor
# (used for coloring the plots below), then tabulate the cluster sizes
st_scale[["Cluster"]] <- as.factor(km[["cluster"]])
table(st_scale[["Cluster"]])

# Expenditures x Salaries
# Scatter of salary_real (x) against expend (y), colored by cluster, with a
# text label per observation. geom_label() inherits label and color from the
# plot-level mapping, so they are declared only once.
# NOTE(review): `states` is presumably the state-name vector created in an
# earlier section -- confirm it is in the same row order as st_scale.
es <- ggplot(
  data = st_scale,
  mapping = aes(x = salary_real, y = expend, color = Cluster, label = states)
) +
  geom_jitter() +
  geom_label(size = 3) +
  xlim(-1.3, 3.7) +
  labs(
    title = "State Legislative Professionalism by Expenditures & Salary",
    subtitle = "Clusters from k-means Algorithm",
    x = "Salary",
    y = "Expenditures"
  ) +
  theme_bw()


# Total x Regular Session Length
# Scatter of t_slength (x) against slength (y), colored by cluster, with a
# text label per observation. geom_label() inherits label and color from the
# plot-level mapping, so they are declared only once.
# NOTE(review): `states` is presumably the state-name vector created in an
# earlier section -- confirm it is in the same row order as st_scale.
trs <- ggplot(
  data = st_scale,
  mapping = aes(x = t_slength, y = slength, color = Cluster, label = states)
) +
  geom_jitter() +
  geom_label(size = 3) +
  xlim(-1.3, 4.5) +
  labs(
    title = "State Legislative Professionalism by Total & Regular Session Length",
    subtitle = "Clusters from k-means Algorithm",
    x = "Total Session Length",
    y = "Regular Session Length"
  ) +
  theme_bw()

# Show both cluster plots side by side in a single row
grid.arrange(es, trs, nrow = 1)

## Internal validation check
# First convert st_scale_dist (presumably a dist object built in an earlier
# section -- confirm) into a plain matrix and preview its first rows
st_scale_dist_m <- as.matrix(st_scale_dist)
head(st_scale_dist_m)

# Run clValid's internal validation for hierarchical and k-means clustering
# across 2 to 10 clusters, then print the summary.
# NOTE(review): clValid documents `obj` as a data matrix with the items to be
# clustered in rows; a distance matrix is supplied here -- confirm this is
# the intended input.
st_prof.internal.k <- clValid(
  obj = st_scale_dist_m,
  nClust = 2:10,
  clMethods = c("hierarchical", "kmeans"),
  validation = "internal"
)
summary(st_prof.internal.k)

# Draw the validation plots on a 2 x 2 grid.
# par() returns the settings it replaces, so keep them and restore them
# afterwards instead of assuming the layout was 1 x 1 beforehand.
old_par <- par(mfrow = c(2, 2))
plot(st_prof.internal.k, legend = FALSE,
     type = "l",
     main = "")
par(old_par) # Restore the previous plot pane layout