# Unsupervised Machine Learning for Clustering in Political and Social Research
# Philip D. Waggoner, pdwaggoner@uchicago.edu

# Section 5

# Note: load packages from Section 1 first

# Fit a two-component multivariate Gaussian mixture by EM on the first four
# columns of the scaled data; the seed is fixed so the fit is reproducible
set.seed(634)
gmm1 <- mvnormalmixEM(x = st_scale[, 1:4], k = 2)

# Bind the model inputs to the posterior component probabilities, then attach:
#   Component  - 1 when the posterior for comp.1 exceeds 0.5, otherwise 2
#   State      - state identifier taken from `states` (assumed row-aligned)
#   Cluster_km - the cluster column already stored on `st_scale`
posterior_df <- cbind(gmm1$x, gmm1$posterior) %>%
  as.data.frame() %>%
  mutate(Component = as.factor(ifelse(comp.1 > 0.5, 1, 2)),
         State = states,
         Cluster_km = st_scale$Cluster)

# The scaled data needs the same key so the two tables can be joined
st_scale$State <- states

# Join on State; columns present in both tables get .x/.y suffixes, so the
# posterior-side copies are given plain names for plotting
full <- posterior_df %>%
  merge(st_scale, by = "State") %>%
  rename(salary = salary_real.x,
         expenditures = expend.x)

# Now plot and then compare each visually
# k-means view: every state is drawn at its (salary, expenditures) position,
# colored by its k-means cluster and labelled with its own name.
# The label is mapped to the State column of `full` rather than the external
# `states` vector: merge() re-sorts rows by the join key, so a free-standing
# vector only lines up with `full` if it happens to share that ordering.
km_plot <- full %>%
  ggplot(aes(salary, expenditures,
             color = Cluster_km,
             label = State)) +
  geom_jitter() +
  # label and color are inherited from the plot-level mapping
  geom_label(size = 3) +
  xlim(-1.3, 3.7) +
  labs(x = "Salary",
       y = "Expenditures",
       title = "State Legislative Professionalism by Expenditures & Salary",
       subtitle = "Clusters from k-means Algorithm") +
  theme_bw()


# Mixture-model view: same layout as the k-means plot, colored by the GMM
# component assignment instead.
# The label is mapped to the State column of `full` rather than the external
# `states` vector: merge() re-sorts rows by the join key, so a free-standing
# vector only lines up with `full` if it happens to share that ordering.
gmm_plot <- full %>%
  ggplot(aes(salary, expenditures,
             color = Component,
             label = State)) +
  geom_jitter() +
  # label and color are inherited from the plot-level mapping
  geom_label(size = 3) +
  xlim(-1.3, 3.7) +
  labs(x = "Salary",
       y = "Expenditures",
       title = "State Legislative Professionalism by Expenditures & Salary",
       subtitle = "Clusters from Gaussian Mixture Model") +
  theme_bw()


# Show the mixture-model plot on its own
gmm_plot

# Then put the k-means and mixture-model plots next to each other (two
# columns, one row) for a direct comparison
grid.arrange(km_plot, gmm_plot, ncol = 2)


# Posterior component probabilities alongside the model inputs, for evaluation
component_probs <- cbind(gmm1$x, gmm1$posterior) %>%
  as.data.frame()

# Numeric table: first ten rows, rounded to three decimals, with the matching
# entries of `states` as row names
probs <- component_probs %>%
  head(n = 10) %>%
  round(digits = 3)
rownames(probs) <- head(states, n = 10)
probs

# visualize
# Rebuild the posterior table and hard-assign each row to component 1 when the
# posterior for comp.1 exceeds 0.5, otherwise to component 2
component_probs <- as.data.frame(cbind(gmm1$x, gmm1$posterior)) %>%
  mutate(Component = as.factor(ifelse(comp.1 > 0.5, 1, 2)))

# Visualize counts of states in components
# Component is already a factor, and counting is geom_bar()'s default stat.
# The legend title is set through `fill`, the aesthetic the bars actually use;
# a `color` label would be ignored because nothing is mapped to color.
compBAR <- ggplot(component_probs, aes(Component)) +
  geom_bar(aes(fill = Component)) +
  labs(x = "Component Assignment",
       y = "Count of States",
       fill = "Component") +
  theme_bw()
compBAR


## Internal validation check
# Coerce `st_scale_dist` (presumably the distance object built in an earlier
# section -- confirm) to a plain matrix and preview the first rows
st_scale_dist_m <- as.matrix(st_scale_dist)
head(st_scale_dist_m)

# Internal validation measures for hierarchical, k-means, and model-based
# clustering across 2 through 10 clusters
# NOTE(review): the data object handed to clValid here is a distance matrix,
# not the scaled feature matrix -- confirm this is intended
st_prof.internal.gmm <- clValid(
  obj = st_scale_dist_m,
  nClust = 2:10,
  clMethods = c("hierarchical", "kmeans", "model"),
  validation = "internal"
)

summary(st_prof.internal.gmm)

# Draw the validation measures as line plots on a 2 x 2 grid.
# par() returns the settings it replaces, so keep them and restore afterwards
# instead of assuming the previous layout was a single panel.
old_par <- par(mfrow = c(2, 2))
plot(st_prof.internal.gmm, legend = FALSE,
     type = "l",
     main = " ")
par(old_par) # Reset plot pane space