# -*- coding: utf-8 -*-
"""
Created on Mon Aug 22 22:03:44 2016

@author: Tammy
"""
# the true-division import used for the crosstab percentages in part three has
# to sit at the very top of the module, so it lives here
from __future__ import division

"""
######################################################
#--------- SCRAPING CODE FROM PART ONE ---------------
######################################################
"""

import urllib2
from bs4 import BeautifulSoup
import re
import pandas as pd

# define base URL for county website
base = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'

# empty list that we will fill with dicts later
master = []

# get the link IDs for each inmate page so that we can iterate over each
url = urllib2.urlopen(base)
doc = url.read()
src = BeautifulSoup(doc, 'html.parser')
arrests = src.find_all('a')     # find all link tags on our table of contents page
pattern = re.compile(".*(bi)")  # the pattern we're looking for in the links

# iterate through each link, check that it's the type we want, grab the rest of
# the URL, then parse through each arrest page to grab info
for arrest in range(1, len(arrests)):
    if pattern.match(arrests[arrest]['href']):
        page = arrests[arrest]['href']
        site = base + page
        url2 = urllib2.urlopen(site)
        doc2 = url2.read()
        src2 = BeautifulSoup(doc2, 'html.parser')

        # create a list of all td tags on the arrest record page
        td = src2.find_all('td')

        # clean up the td tag strings
        b = list()
        for t in td:
            b.append(str(t.string.strip()))

        # define a dictionary of all the info housed in the tags
        info = {'ID': b[0],
                'Name': b[1],
                'Arrest date/time': b[2],
                'Age': int(b[3]),
                'Height': b[4],
                'Weight': int(b[5]),
                'Race': b[6],
                'Sex': b[7],
                'Eyes': b[8],
                'Hair': b[9],
                'Case #': b[10],
                'Description': b[11],
                'Bond': b[12],
                'Bond Type': b[13]}

        # add the latest info to the master list
        master.append(info)

# turn into a data frame that we can play with
data = pd.DataFrame(master)

# convert certain columns to categoricals
cats = ['Race', 'Sex', 'Eyes', 'Hair', 'Description', 'Bond Type']
for cat in cats:
    data[cat] = data[cat].astype('category')

"""
######################################################
#----------- NEW CODE FOR PART THREE -----------------
######################################################
"""

import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# unique arrest descriptions and their frequencies
offenses = data.Description.value_counts()

# several common words appear in the list of offenses, so let's manually bucket
# them and see how those buckets are distributed
offense_bucket = list()
for each in data.Description:
    if "VIOLATION" in each:
        new = "Violation"
    elif "POSSESS" in each:
        new = "Possession"
    elif "ASSAULT" in each:
        new = "Assault"
    elif "THEFT" in each:
        new = "Theft"
    elif "MURDER" in each:
        new = "Murder"
    elif "NARC" in each:
        new = "Narcotics"
    else:
        new = "Other"
    offense_bucket.append(new)

data['OffenseBucket'] = pd.Series(offense_bucket, index=data.index)
data.OffenseBucket.value_counts()
# the "Other" bucket is the largest, so we're clearly missing some categories

"""
Alternatively, we can allow the arrest types to cluster themselves using text
mining techniques. To keep things clear, from now on let's refer to each
individual offense description as a "document", and each unique word and/or
item of punctuation as a "token".
"""

# let's get down to just the unique documents:
unique_docs = list(set(data.Description))

# let's pause and see some basic stats on the text data we're working with
# unique documents:
print("There are %d unique arrest types." % len(unique_docs))
unique_docs[1:5]
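# A quick illustration of the "document"/"token" terminology before we start
# cleaning: nltk.word_tokenize splits a document into tokens (words, numbers,
# and punctuation). The description below is a made-up example rather than one
# pulled from the Polk County data, and this assumes nltk's 'punkt' tokenizer
# models have already been downloaded via nltk.download('punkt').
sample_doc = "THEFT 2ND DEGREE - MOTOR VEHICLE"
print(nltk.word_tokenize(sample_doc))
# roughly ['THEFT', '2ND', 'DEGREE', '-', 'MOTOR', 'VEHICLE']: six tokens,
# including the number and the dash, which is why the cleanup step below keeps
# only alphabetic tokens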
# clean up our data to remove numbers, symbols, and words like "violation",
# "degree", and "offense". In such short documents, words like "degree" can
# seem disproportionately important, and I care about the type of crime more
# than its severity, so I don't want a cluster that contains all the 1st, 2nd,
# and 3rd degree crimes. Stem so that terms like "possession" and "possess"
# are treated as the same.
stemmer = PorterStemmer()
clean_docs = []
for doc in unique_docs:
    word_list = []
    for word in nltk.word_tokenize(doc):
        if word.isalpha() and word not in ["DEGREE", "OFFENSE", "VIOLATION",
                                           "OFFENDER", "FAILURE"]:
            word_list.append(stemmer.stem(word.lower()))
    word_list2 = " ".join(word_list)
    clean_docs.append(word_list2)

clean_docs[1:5]

# calculate the term frequency-inverse document frequency (tf-idf) for each
# document. This measures how important each token is in the context of a
# document while also weighing how frequently the token appears across
# documents: if a term appears often in a single document but rarely across
# the full set of documents, it is important to that document.
# The default settings for TfidfVectorizer take care of removing symbols and
# punctuation and converting to lowercase.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
tfidf_matrix = tfidf.fit_transform(clean_docs)

print(tfidf_matrix.shape)
# the first number is our number of unique documents, and the second is the
# number of unique terms

# now we use k-means clustering to define k clusters. We don't know what the
# best number of clusters is, so we try a bunch and compare them.
inertia = []
ktrials = []
for k in range(2, 30):
    km = KMeans(n_clusters=k, n_init=100, random_state=12345, init='k-means++')
    km.fit(tfidf_matrix)
    inertia.append(km.inertia_)
    ktrials.append(float(k))

# plot our results and look for an "elbow"
plt.plot(ktrials, inertia)
plt.ylabel('inertia')
plt.xlabel('clusters')
plt.show()

# the closest thing to an "elbow" is at 8, but that leaves out some important
# categories, so let's expand to 15 - more than that would be onerous
km = KMeans(n_clusters=15, n_init=100, random_state=12345, init='k-means++')
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

# join back to the original, unprocessed list of arrest descriptions
results = pd.DataFrame(unique_docs, index=clusters, columns=['Description'])
results['Cluster'] = results.index

# number of arrest descriptions in each cluster:
results.index.value_counts(sort=False)

# what arrest descriptions are in each cluster?
[results.loc[i] for i in range(0, 15)]

# some of these work quite well - there's a narcotics cluster, a theft cluster,
# etc. - but there is definitely a "miscellaneous" category that doesn't really
# go away even when expanding the number of clusters. If I expand to 10 or 15
# clusters, I get more categories like intoxication, driving offenses, and
# murder. Still, it's better than my manual assignments.
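# Before naming the clusters by hand, it can help to peek at which terms sit
# closest to each cluster centroid. This is an optional sketch rather than part
# of the original walkthrough, and it assumes our scikit-learn version exposes
# TfidfVectorizer.get_feature_names() (newer releases rename it
# get_feature_names_out()).
terms = tfidf.get_feature_names()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]  # highest-weight terms first
for i in range(15):
    top_terms = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster %d: %s" % (i, ", ".join(top_terms)))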
# give the clusters names. There isn't a lot of overlap, although a sizeable
# miscellaneous category remains.
results.loc[results['Cluster'] == 0, 'ClusterName'] = 'Criminal Mischief'
results.loc[results['Cluster'] == 1, 'ClusterName'] = 'Theft'
results.loc[results['Cluster'] == 2, 'ClusterName'] = 'Weapons'
results.loc[results['Cluster'] == 3, 'ClusterName'] = 'Misc'
results.loc[results['Cluster'] == 4, 'ClusterName'] = 'Controlled Substances'
results.loc[results['Cluster'] == 5, 'ClusterName'] = 'Driving Under Influence'
results.loc[results['Cluster'] == 6, 'ClusterName'] = 'Domestic/Sexual Abuse'
results.loc[results['Cluster'] == 7, 'ClusterName'] = 'Trespassing'
results.loc[results['Cluster'] == 8, 'ClusterName'] = 'Murder'
results.loc[results['Cluster'] == 9, 'ClusterName'] = 'Injury'
results.loc[results['Cluster'] == 10, 'ClusterName'] = 'Burglary'
results.loc[results['Cluster'] == 11, 'ClusterName'] = 'Narcotics'
results.loc[results['Cluster'] == 12, 'ClusterName'] = 'Arson'
results.loc[results['Cluster'] == 13, 'ClusterName'] = 'Harassment'
results.loc[results['Cluster'] == 14, 'ClusterName'] = 'Intoxication/Drunk Driving'

# merge this back with our full arrest data
data = pd.merge(data, results, on='Description')
data.ClusterName.value_counts()
# still a large misc category, but a good distribution of other categories

"""
How are the arrest types distributed across all individuals? By race?
By gender? Time to do some graphing again.
"""

import plotly
import plotly.graph_objs as go
import plotly.plotly as py

# setup for our tools
plotly.tools.set_credentials_file(username='tammylarmstrong', api_key='########')

# formatting
legend = dict(x=0, y=100)

def stack_layout(chartitle):
    return go.Layout(barmode='stack', xaxis=dict(tickangle=45),
                     title=chartitle, legend=legend)

# compare offense types by gender
ct = pd.crosstab(data.ClusterName, [data.Sex],
                 rownames=['Offense Type'], colnames=['Sex'])

data.ClusterName = data.ClusterName.astype('category')
arrestcats = data.ClusterName.cat.categories

female = go.Bar(x=arrestcats.tolist(), y=ct.Female.values.tolist(), name='Female')
male = go.Bar(x=arrestcats.tolist(), y=ct.Male.values.tolist(), name='Male')
sex_clusters = [male, female]

stack = stack_layout('Arrest Types by Gender')
fig1 = go.Figure(data=sex_clusters, layout=stack)
py.plot(fig1, filename='arrest-types-by-gender')

# show race as a percentage of total arrests of that type
ct2 = pd.crosstab(data.ClusterName, data.Race).apply(lambda r: r / r.sum(), axis=1)
ct2_round = ct2.round(2)

Black = go.Bar(x=arrestcats.tolist(), y=ct2_round.Black.values.tolist(), name='Black')
White = go.Bar(x=arrestcats.tolist(), y=ct2_round.White.values.tolist(), name='White')
Asian = go.Bar(x=arrestcats.tolist(), y=ct2_round.Asian.values.tolist(), name='Asian')
Pacific = go.Bar(x=arrestcats.tolist(), y=ct2_round['Pacific Islander'].values.tolist(),
                 name='Pacific Islander')
Unknown = go.Bar(x=arrestcats.tolist(), y=ct2_round['Unknown'].values.tolist(),
                 name='Unknown')
race_clusters = [Black, White, Asian, Pacific, Unknown]

# annotate each stacked bar with the number of arrests in that cluster
layout = go.Layout(barmode='stack',
                   title='Race as percentage of each arrest type',
                   legend=legend,
                   yaxis=dict(range=[0, 1]),
                   annotations=[dict(x=xi,
                                     y=0.9,
                                     text=str(yi),
                                     xanchor='center',
                                     yanchor='bottom',
                                     showarrow=False)
                                for xi, yi in zip(arrestcats,
                                                  data.ClusterName.value_counts(sort=False))])
fig2 = go.Figure(data=race_clusters, layout=layout)
py.plot(fig2, filename='arrest-types-by-race')
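# To double-check what the row-normalized crosstab behind the race chart
# contains, here is a toy example with made-up rows (not drawn from the arrest
# data): dividing each row by its own sum turns raw counts into
# within-offense-type proportions, so every row sums to 1.
toy = pd.DataFrame({'Offense': ['Theft', 'Theft', 'Theft', 'Misc'],
                    'Race': ['Black', 'White', 'White', 'White']})
print(pd.crosstab(toy.Offense, toy.Race).apply(lambda r: r / r.sum(), axis=1))
# Race     Black  White
# Offense
# Misc      0.00   1.00
# Theft     0.33   0.67     (values shown rounded)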