# -*- coding: utf-8 -*-
"""
Created on Mon Aug 22 22:03:44 2016

@author: Tammy
"""
# the true-division import used for the crosstab percentages in part three has
# to sit at the very top of the module, so it lives here
from __future__ import division

"""
######################################################
#--------- SCRAPING CODE FROM PART ONE ---------------
######################################################
"""

import urllib2
from bs4 import BeautifulSoup
import re
import pandas as pd

# define base URL for county website
base = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'

# empty list that we will fill with dicts later
master = []

# get the link IDs for each inmate page so that we can iterate over each
url = urllib2.urlopen(base)
doc = url.read()
src = BeautifulSoup(doc, 'html.parser')
arrests = src.find_all('a')     # find all link tags on our table of contents page
pattern = re.compile(".*(bi)")  # the pattern we're looking for in the links

# iterate through each link, check that it's the type we want, grab the rest of
# the URL, then parse through each arrest page to grab info
for arrest in range(1, len(arrests)):
    if pattern.match(arrests[arrest]['href']):
        page = arrests[arrest]['href']
        site = base + page
        url2 = urllib2.urlopen(site)
        doc2 = url2.read()
        src2 = BeautifulSoup(doc2, 'html.parser')

        # create a list of all td tags on the arrest record page
        td = src2.find_all('td')

        # clean up the td tag strings
        b = list()
        for t in td:
            b.append(str(t.string.strip()))

        # define a dictionary of all the info housed in the tags
        info = {'ID': b[0],
                'Name': b[1],
                'Arrest date/time': b[2],
                'Age': int(b[3]),
                'Height': b[4],
                'Weight': int(b[5]),
                'Race': b[6],
                'Sex': b[7],
                'Eyes': b[8],
                'Hair': b[9],
                'Case #': b[10],
                'Description': b[11],
                'Bond': b[12],
                'Bond Type': b[13]}

        # add the latest info to the master list
        master.append(info)

# turn into a data frame that we can play with
data = pd.DataFrame(master)

# convert certain columns to categoricals
cats = ['Race', 'Sex', 'Eyes', 'Hair', 'Description', 'Bond Type']
for cat in cats:
    data[cat] = data[cat].astype('category')

"""
######################################################
#----------- NEW CODE FOR PART THREE -----------------
######################################################
"""

import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# unique arrest descriptions and their frequencies
offenses = data.Description.value_counts()

# several common words appear in the list of offenses, so let's manually bucket
# them and see how those buckets are distributed
offense_bucket = list()
for each in data.Description:
    if "VIOLATION" in each:
        new = "Violation"
    elif "POSSESS" in each:
        new = "Possession"
    elif "ASSAULT" in each:
        new = "Assault"
    elif "THEFT" in each:
        new = "Theft"
    elif "MURDER" in each:
        new = "Murder"
    elif "NARC" in each:
        new = "Narcotics"
    else:
        new = "Other"
    offense_bucket.append(new)

data['OffenseBucket'] = pd.Series(offense_bucket, index=data.index)
data.OffenseBucket.value_counts()
# the "Other" bucket is the largest, so we're clearly missing some categories

"""
Alternatively, we can allow the arrest types to cluster themselves using text
mining techniques. To keep things clear, from now on let's refer to each
individual offense description as a "document", and each unique word and/or
item of punctuation as a "token".
"""

# let's get down to just the unique documents:
unique_docs = list(set(data.Description))

# let's pause and see some basic stats on the text data we're working with
# unique documents:
print("There are %d unique arrest types." % len(unique_docs))
unique_docs[1:5]
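# A quick illustration of the "document"/"token" terminology before we start
# cleaning: nltk.word_tokenize splits a document into tokens (words, numbers,
# and punctuation). The description below is a made-up example rather than one
# pulled from the Polk County data, and this assumes nltk's 'punkt' tokenizer
# models have already been downloaded via nltk.download('punkt').
sample_doc = "THEFT 2ND DEGREE - MOTOR VEHICLE"
print(nltk.word_tokenize(sample_doc))
# roughly ['THEFT', '2ND', 'DEGREE', '-', 'MOTOR', 'VEHICLE']: six tokens,
# including the number and the dash, which is why the cleanup step below keeps
# only alphabetic tokens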
# clean up our data to remove numbers, symbols, and words like "violation",
# "degree", and "offense". In such short documents, words like "degree" can
# seem disproportionately important, and I care about the type of crime more
# than its severity, so I don't want a cluster that contains all the 1st, 2nd,
# and 3rd degree crimes. Stem so that terms like "possession" and "possess"
# are treated as the same.
stemmer = PorterStemmer()
clean_docs = []
for doc in unique_docs:
    word_list = []
    for word in nltk.word_tokenize(doc):
        if word.isalpha() and word not in ["DEGREE", "OFFENSE", "VIOLATION",
                                           "OFFENDER", "FAILURE"]:
            word_list.append(stemmer.stem(word.lower()))
    word_list2 = " ".join(word_list)
    clean_docs.append(word_list2)

clean_docs[1:5]

# calculate the term frequency-inverse document frequency (tf-idf) for each
# document. This measures how important each token is in the context of a
# document while also weighing how frequently the token appears across
# documents: if a term appears often in a single document but rarely across
# the full set of documents, it is important to that document.
# The default settings for TfidfVectorizer take care of removing symbols and
# punctuation and converting to lowercase.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
tfidf_matrix = tfidf.fit_transform(clean_docs)

print(tfidf_matrix.shape)
# the first number is our number of unique documents, and the second is the
# number of unique terms

# now we use k-means clustering to define k clusters. We don't know what the
# best number of clusters is, so we try a bunch and compare them.
inertia = []
ktrials = []
for k in range(2, 30):
    km = KMeans(n_clusters=k, n_init=100, random_state=12345, init='k-means++')
    km.fit(tfidf_matrix)
    inertia.append(km.inertia_)
    ktrials.append(float(k))

# plot our results and look for an "elbow"
plt.plot(ktrials, inertia)
plt.ylabel('inertia')
plt.xlabel('clusters')
plt.show()

# the closest thing to an "elbow" is at 8, but that leaves out some important
# categories, so let's expand to 15 - more than that would be onerous
km = KMeans(n_clusters=15, n_init=100, random_state=12345, init='k-means++')
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

# join back to the original, unprocessed list of arrest descriptions
results = pd.DataFrame(unique_docs, index=clusters, columns=['Description'])
results['Cluster'] = results.index

# number of arrest descriptions in each cluster:
results.index.value_counts(sort=False)

# what arrest descriptions are in each cluster?
[results.loc[i] for i in range(0, 15)]

# some of these work quite well - there's a narcotics cluster, a theft cluster,
# etc. - but there is definitely a "miscellaneous" category that doesn't really
# go away even when expanding the number of clusters. If I expand to 10 or 15
# clusters, I get more categories like intoxication, driving offenses, and
# murder. Still, it's better than my manual assignments.
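# Before naming the clusters by hand, it can help to peek at which terms sit
# closest to each cluster centroid. This is an optional sketch rather than part
# of the original walkthrough, and it assumes our scikit-learn version exposes
# TfidfVectorizer.get_feature_names() (newer releases rename it
# get_feature_names_out()).
terms = tfidf.get_feature_names()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]  # highest-weight terms first
for i in range(15):
    top_terms = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster %d: %s" % (i, ", ".join(top_terms)))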
# give the clusters names. There isn't a lot of overlap, although a sizeable
# miscellaneous category remains.
results.loc[results['Cluster'] == 0, 'ClusterName'] = 'Criminal Mischief'
results.loc[results['Cluster'] == 1, 'ClusterName'] = 'Theft'
results.loc[results['Cluster'] == 2, 'ClusterName'] = 'Weapons'
results.loc[results['Cluster'] == 3, 'ClusterName'] = 'Misc'
results.loc[results['Cluster'] == 4, 'ClusterName'] = 'Controlled Substances'
results.loc[results['Cluster'] == 5, 'ClusterName'] = 'Driving Under Influence'
results.loc[results['Cluster'] == 6, 'ClusterName'] = 'Domestic/Sexual Abuse'
results.loc[results['Cluster'] == 7, 'ClusterName'] = 'Trespassing'
results.loc[results['Cluster'] == 8, 'ClusterName'] = 'Murder'
results.loc[results['Cluster'] == 9, 'ClusterName'] = 'Injury'
results.loc[results['Cluster'] == 10, 'ClusterName'] = 'Burglary'
results.loc[results['Cluster'] == 11, 'ClusterName'] = 'Narcotics'
results.loc[results['Cluster'] == 12, 'ClusterName'] = 'Arson'
results.loc[results['Cluster'] == 13, 'ClusterName'] = 'Harassment'
results.loc[results['Cluster'] == 14, 'ClusterName'] = 'Intoxication/Drunk Driving'

# merge this back with our full arrest data
data = pd.merge(data, results, on='Description')
data.ClusterName.value_counts()
# still a large misc category, but a good distribution of other categories

"""
How are the arrest types distributed across all individuals? By race?
By gender? Time to do some graphing again.
"""

import plotly
import plotly.graph_objs as go
import plotly.plotly as py

# setup for our tools
plotly.tools.set_credentials_file(username='tammylarmstrong', api_key='########')

# formatting
legend = dict(x=0, y=100)

def stack_layout(chartitle):
    return go.Layout(barmode='stack', xaxis=dict(tickangle=45),
                     title=chartitle, legend=legend)

# compare offense types by gender
ct = pd.crosstab(data.ClusterName, [data.Sex],
                 rownames=['Offense Type'], colnames=['Sex'])

data.ClusterName = data.ClusterName.astype('category')
arrestcats = data.ClusterName.cat.categories

female = go.Bar(x=arrestcats.tolist(), y=ct.Female.values.tolist(), name='Female')
male = go.Bar(x=arrestcats.tolist(), y=ct.Male.values.tolist(), name='Male')
sex_clusters = [male, female]

stack = stack_layout('Arrest Types by Gender')
fig1 = go.Figure(data=sex_clusters, layout=stack)
py.plot(fig1, filename='arrest-types-by-gender')

# show race as a percentage of total arrests of that type
ct2 = pd.crosstab(data.ClusterName, data.Race).apply(lambda r: r / r.sum(), axis=1)
ct2_round = ct2.round(2)

Black = go.Bar(x=arrestcats.tolist(), y=ct2_round.Black.values.tolist(), name='Black')
White = go.Bar(x=arrestcats.tolist(), y=ct2_round.White.values.tolist(), name='White')
Asian = go.Bar(x=arrestcats.tolist(), y=ct2_round.Asian.values.tolist(), name='Asian')
Pacific = go.Bar(x=arrestcats.tolist(), y=ct2_round['Pacific Islander'].values.tolist(),
                 name='Pacific Islander')
Unknown = go.Bar(x=arrestcats.tolist(), y=ct2_round['Unknown'].values.tolist(),
                 name='Unknown')
race_clusters = [Black, White, Asian, Pacific, Unknown]

# annotate each stacked bar with the number of arrests in that cluster
layout = go.Layout(barmode='stack',
                   title='Race as percentage of each arrest type',
                   legend=legend,
                   yaxis=dict(range=[0, 1]),
                   annotations=[dict(x=xi,
                                     y=0.9,
                                     text=str(yi),
                                     xanchor='center',
                                     yanchor='bottom',
                                     showarrow=False)
                                for xi, yi in zip(arrestcats,
                                                  data.ClusterName.value_counts(sort=False))])
fig2 = go.Figure(data=race_clusters, layout=layout)
py.plot(fig2, filename='arrest-types-by-race')
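# To double-check what the row-normalized crosstab behind the race chart
# contains, here is a toy example with made-up rows (not drawn from the arrest
# data): dividing each row by its own sum turns raw counts into
# within-offense-type proportions, so every row sums to 1.
toy = pd.DataFrame({'Offense': ['Theft', 'Theft', 'Theft', 'Misc'],
                    'Race': ['Black', 'White', 'White', 'White']})
print(pd.crosstab(toy.Offense, toy.Race).apply(lambda r: r / r.sum(), axis=1))
# Race     Black  White
# Offense
# Misc      0.00   1.00
# Theft     0.33   0.67     (values shown rounded)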