Title

INMATE PROFILES

Last, but not least, I will use a clustering technique to determine whether there are any common "profiles" of inmates, based on some demographics and the type of crime they were arrested for. Conceptually, this process is very similar to how I text-mined the arrest descriptions to create arrest categories: it uses measures of similarity to find naturally occurring, relatively homogeneous groups. I am using a method called k-modes, which is similar to the k-means method I used in my last post on the arrest descriptions, but which is more appropriate for the categorical variables I'm working with. I don't know in advance how many clusters are appropriate, so I'll try 2 through 20 and see whether there's a cutoff point beyond which adding more clusters does not improve the overall solution by much.

library(MASS)
library(rpart)
library(klaR)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

# read in our inmates data, as of approximately 4am CT on 9/3/2016

# Load the inmate snapshot. Text columns are read as factors because the
# k-modes clustering further down works on categorical variables.
data <- read.csv(
  file = "C:/Users/Tammy/Documents/Arrests20160903.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

# finally, we can attempt to cluster the inmates into profiles using k-modes, a method appropriate
# for categorical data. In this case, we need all of our data to be categorical, so we'll bucket
# Age into factors using the cut() function

# Look at the raw age distribution before bucketing it.
hist(data$Age)

# Split Age into five equal-width intervals so it can be treated as a
# categorical variable, then show how many inmates fall into each interval.
data[["AgeBin"]] <- cut(data[["Age"]], breaks = 5)
plot(data$AgeBin)

# Elbow scan: fit k-modes for every k from 2 to 20 and record the total
# within-cluster distance of each solution in `wd` (columns `k` and `wd`).
#
# kmodes() starts from randomly chosen rows, so the seed is fixed to make the
# scan reproducible. Each k still gets only a single random start, so the
# curve is not guaranteed to decrease monotonically.
set.seed(20160903)

profile_vars <- c("AgeBin", "Race", "Sex", "ClusterName")
k_values <- 2:20

# Preallocate the result table instead of growing it with rbind() in the loop.
wd <- data.frame(k = k_values, wd = NA_real_)

for (k in k_values) {
  profiles <- kmodes(data[profile_vars], modes = k)
  row <- which(wd$k == k)
  wd$wd[row] <- sum(profiles$withindiff)
  # Report progress as each fit finishes (now includes k = 2 as well).
  print(wd[row, ])
}

##   k   wd
## 2 3 1180
##   k  wd
## 3 4 940
##   k  wd
## 4 5 882
##   k  wd
## 5 6 897
##   k  wd
## 6 7 768
##   k  wd
## 7 8 815
##   k  wd
## 8 9 714
##    k  wd
## 9 10 726
##     k  wd
## 10 11 731
##     k  wd
## 11 12 681
##     k  wd
## 12 13 757
##     k  wd
## 13 14 626
##     k  wd
## 14 15 739
##     k  wd
## 15 16 617
##     k  wd
## 16 17 669
##     k  wd
## 17 18 622
##     k  wd
## 18 19 644
##     k  wd
## 19 20 619

# Elbow plot: total within-cluster distance against the number of clusters.
# kmodes() reports simple-matching distances (counts of mismatched
# categories), not squared differences, so the title and axes say so.
plot(
  wd$k, wd$wd,
  type = "b",
  main = "Within-Cluster Simple-Matching Distance by Number of Clusters",
  xlab = "Number of clusters (k)",
  ylab = "Total within-cluster distance"
)

# Fit the five-profile solution. kmodes() picks its starting modes at random,
# so the seed is fixed to make the reported profiles reproducible from run
# to run.
set.seed(20160903)
profiles <- kmodes(data[c("AgeBin", "Race", "Sex", "ClusterName")], modes = 5)

# One row per cluster: the modal level of each variable within that cluster.
results <- profiles$modes
results

##        AgeBin  Race    Sex         ClusterName
## 1 (17.9,29.6] White   Male                Misc
## 2 (29.6,41.2] White   Male                Misc
## 3 (29.6,41.2] White Female Probation Violation
## 4 (29.6,41.2] Black   Male                Misc
## 5 (17.9,29.6] White Female                Misc

# Results from a separate run, kept for comparison (the modes differ between runs):
# AgeBin  Race    Sex         ClusterName
# 1 (29.6,41.2] White   Male                Misc
# 2 (17.9,29.6] Black   Male                Misc
# 3 (29.6,41.2] White Female Probation Violation
# 4 (41.2,52.8] White   Male           Narcotics
# 5 (41.2,52.8] Black   Male                Misc

# Number of inmates assigned to each of the five clusters.
print(profiles[["size"]])

## cluster
##   1   2   3   4   5 
## 394 180  96 116  55

# cluster sizes
# cluster
# 1   2   3   4   5 
# 468 148 112  81  32 

# interpretation of the five profiles in the commented-out results table above:
# middle aged white men
# young black men
# middle aged white females on probation violation
# older white men on narcotics charges
# older black men

# Fit a finer-grained twelve-profile solution for comparison. The seed is
# fixed because kmodes() chooses its starting modes at random; without it the
# profiles change on every run.
set.seed(20160903)
profiles2 <- kmodes(data[c("AgeBin", "Race", "Sex", "ClusterName")], modes = 12)

# One row per cluster: the modal level of each variable within that cluster.
results2 <- profiles2$modes
results2

##         AgeBin  Race    Sex         ClusterName
## 1  (17.9,29.6] White   Male                Misc
## 2  (17.9,29.6] White Female                Misc
## 3  (41.2,52.8] White   Male                Misc
## 4  (29.6,41.2] White Female Probation Violation
## 5  (29.6,41.2] Black   Male                Misc
## 6  (52.8,64.4] Black   Male           Narcotics
## 7  (41.2,52.8] Black   Male                Misc
## 8  (17.9,29.6] Black   Male Probation Violation
## 9  (29.6,41.2] White   Male    DWI/Intoxication
## 10 (17.9,29.6] Black   Male           Narcotics
## 11 (64.4,76.1] White   Male        Interference
## 12 (17.9,29.6] Black Female Probation Violation