# Unsupervised Machine Learning for Clustering in Political and Social Research
# Philip D. Waggoner, pdwaggoner@uchicago.edu

# Section 6

# Note: load packages from Section 1 first

# Advanced methods

## 1: Fuzzy c-means
# Rebuild the working data: keep the 2009/10 session, drop the identifier and
# MDS columns, then remove any rows with missing values.
st <- x %>% 
  filter(sessid == "2009/10") %>% 
  select(-c(fips, stateabv, sessid, mds1, mds2, year)) %>%
  na.omit()
skim(st)

# State names, row-aligned with `st`. Defined here, before the plot below,
# because the plot's `label = states` mapping reads this vector from the
# global environment (it is not a column of `st_scale`).
states <- st$state

# Standardize columns 2 to 5 of `st` (each column centered and scaled)
st_scale <- data.frame(scale(st[, 2:5]))

# Load the library
library(e1071)

# Fit and store the FCM model: two clusters, fuzzifier m = 2
# NOTE(review): cmeans() picks its starting centers at random when `centers`
# is a count, so which group ends up labelled 1 can differ between runs unless
# a seed was set earlier -- confirm before relying on the label swap below.
cm <- cmeans(st_scale, 
             centers = 2, 
             m = 2)

# Store the hard assignments, swapping the labels (1 <-> 2)
st_scale$Cluster <- cm$cluster 
st_scale$Cluster <- as.factor( 
  ifelse(st_scale$Cluster == 1, 2, 1)
) 

table(st_scale$Cluster) # inspect cluster sizes

# Visualize FCM over expenditures and salaries
# geom_label() inherits the label and color mappings from ggplot(), so they
# are declared only once.
ggplot(st_scale, aes(salary_real, expend, 
                     color = Cluster, 
                     label = states)) +
  geom_jitter() +
  geom_label(size = 3) + 
  xlim(-1.3, 3.7) +
  labs(x = "Salary",
       y = "Expenditures",
       title = "State Legislative Professionalism by Expenditures & Salary",
       subtitle = "Clusters from Fuzzy C-Means Algorithm") +
  theme_bw()

# More precise numeric inspection: membership degrees for the first 10 rows
membership <- as.data.frame(cm$membership[1:10, ])
rownames(membership) <- states[1:10]
# Swap the two columns so they line up with the swapped cluster labels above
membership <- membership[, c(2, 1)]
colnames(membership) <- c("Cluster 1", "Cluster 2")
round(membership, 2)


## 2: DBSCAN
# Start again from the 2009/10 session: drop identifier/MDS columns and
# discard incomplete rows, then print a summary of what is left.
st <- x %>%
  filter(sessid == "2009/10") %>%
  select(-c(fips, stateabv, sessid, mds1, mds2, year)) %>%
  na.omit()
skim(st)

# Standardize columns 2 to 5, give them display names, and convert to a
# numeric matrix for the dbscan functions below.
st_scale <- st[, 2:5] %>%
  scale() %>%
  data.frame() %>%
  rename("Total Length" = t_slength,
         "Regular Length" = slength,
         "Salary" = salary_real,
         "Expenditures" = expend) %>%
  as.matrix()

# Load the library
library(dbscan)

# Choose epsilon: plot the sorted 4-nearest-neighbor distances and mark the
# candidate threshold (1.2) with a red horizontal line.
kNNdistplot(st_scale, k = 4)
abline(h = 1.2, col = "red")

# Run the algorithm with the epsilon read off the plot
dbscan_fit <- dbscan(st_scale, eps = 1.2, minPts = 4)

# Pairwise scatterplots of all (standardized) features.
# Colors are the ggplot2 default hex codes, for consistency with the other
# figures: rows assigned to cluster 1 get "#F8766D", every other row
# gets "#00BFC4".
pairs(st_scale,
      pch = 19,
      col = c("#00BFC4", "#F8766D")[(dbscan_fit$cluster == 1) + 1])

# Label rows by state so the cluster plot can print state names
rownames(st_scale) <- st$state

# Visualize cluster assignment
fviz_cluster(dbscan_fit, st_scale,
             main = "Cluster Assignments from DBSCAN Algorithm",
             repel = TRUE,
             show.clust.cent = FALSE,
             outlier.color = "#00BFC4",
             labelsize = 7,
             pointsize = 1.5) +
  theme_bw()


## 3: PAM
# Rebuild the working data: keep the 2009/10 session, drop the identifier and
# MDS columns, then remove any rows with missing values.
st <- x %>% 
  filter(sessid == "2009/10") %>% 
  select(-c(fips, stateabv, sessid, mds1, mds2, year)) %>%
  na.omit()
skim(st)

# Standardize columns 2 to 5 and give them display names
st_scale <- data.frame(scale(st[, 2:5])) %>% 
  rename(`Total Length` = t_slength,
         `Regular Length` = slength,
         `Salary` = salary_real,
         `Expenditures` = expend)

# Load the library
library(cluster)

# Fit the algorithm on the four standardized features ONLY.
# The state names must not be in the data frame at this point: pam() coerces
# its input with data.matrix(), which would turn a character/factor State
# column into integer codes and let alphabetical order drive the distances.
pam_fit <- pam(st_scale, 
               k = 2,
               metric = "euclidean")

# Attach labels and cluster assignments for plotting (after the fit)
st_scale$State <- st$state
st_scale$Cluster <- as.factor(pam_fit$cluster)

table(st_scale$Cluster) # inspect

# PAM over expenditures and salaries
# NOTE: the name `pam` is kept for the plot object for backward compatibility;
# it shadows the variable name only -- calls to pam() still find the function.
pam <- ggplot(st_scale, aes(Salary, Expenditures, 
                            color = Cluster, 
                            label = State)) +
  geom_jitter() +
  geom_label(size = 3) + 
  xlim(-1.3, 3.7) +
  labs(x = "Salary",
       y = "Expenditures",
       title = "State Legislative Professionalism by Expenditures & Salary",
       subtitle = "Clusters from PAM Algorithm") +
  theme_bw()
pam

### Compare to k-means
# `km` is the k-means fit created in an earlier section.
# NOTE(review): assumes km was fit on the same rows in the same order as the
# current `st` -- confirm against the section that creates it.
# The PAM plot above already captured its own copy of the data, so
# overwriting Cluster here does not alter it.
st_scale$Cluster <- as.factor(km$cluster) # save clusters in df
table(st_scale$Cluster)

# Expenditures and salaries
es <- ggplot(st_scale, aes(Salary, Expenditures, 
                           color = Cluster, 
                           label = State)) +
  geom_jitter() +
  geom_label(size = 3) +
  xlim(-1.3, 3.7) +
  labs(title = "State Legislative Professionalism by Expenditures & Salary",
       subtitle = "Clusters from k-means Algorithm") +
  theme_bw()

# Side-by-side comparison: PAM (left) vs. k-means (right)
grid.arrange(pam, es, nrow = 1)