Code from Part Two of the Polk County inmates series
In [2]:
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 01 18:15:05 2016
Polk County arrest records crawler
@author: Tammy
"""
import os
import re
import urllib2

import pandas as pd
from bs4 import BeautifulSoup
# define base URL for county website
base = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'
# empty list that we will fill with one dict per inmate later
master = []
# get the link IDs for each inmate page so that we can iterate over each.
# The HTTP response is closed as soon as its body has been read so the
# connection is not left open for the rest of the session.
url = urllib2.urlopen(base)
try:
    doc = url.read()
finally:
    url.close()
src = BeautifulSoup(doc, 'html.parser')
arrests = src.find_all('a') # find all link tags on our table of contents page
pattern = re.compile(".*(bi)") # the pattern we're looking for in the links
# iterate through each link, check that it's the type we want, grab the rest of
# the URL, then parse through each arrest page to grab info
# NOTE: iteration starts at the second link (index 1), so the first <a> tag
# on the page is never examined.
for link in arrests[1:]:
    # an <a> tag without an href attribute cannot point at an inmate page;
    # .get() returns None for it instead of raising KeyError
    page = link.get('href')
    if page is None or not pattern.match(page):
        continue
    site = base + page
    url2 = urllib2.urlopen(site)
    try:
        doc2 = url2.read()
    finally:
        # release the connection even if the read fails
        url2.close()
    src2 = BeautifulSoup(doc2, 'html.parser')
    # create a list of all td tags on the arrest record page
    td = src2.find_all('td')
    # clean up the td tag strings. get_text() is used rather than .string
    # because .string is None for an empty cell or a cell with more than one
    # child node, and None has no .strip().
    # NOTE(review): under Python 2, str() raises UnicodeEncodeError if a cell
    # contains non-ASCII text -- confirm whether that can occur on this site.
    b = [str(t.get_text().strip()) for t in td]
    # define a dictionary of all the info housed in the tags
    # (the cells are read by position, so this assumes at least 14 <td> tags
    # in a fixed order on every arrest page)
    info = {'ID': b[0], 'Name': b[1], 'Arrest date/time': b[2], 'Age': int(b[3]),
            'Height': b[4], 'Weight': int(b[5]), 'Race': b[6], 'Sex': b[7],
            'Eyes': b[8], 'Hair': b[9], 'Case #': b[10], 'Description': b[11],
            'Bond': b[12], 'Bond Type': b[13]}
    # add the latest info to the master list
    master.append(info)
# build the data frame that the rest of the analysis works from
data = pd.DataFrame(master)
# columns that hold a small set of repeated labels are stored as categoricals
cats = ['Race', 'Sex', 'Eyes', 'Hair', 'Description', 'Bond Type']
# replace each listed column with its categorical version in one step
data = data.assign(**dict((col, data[col].astype('category')) for col in cats))
In [3]:
######################################################
#------------ NEW CODE FOR PART TWO ------------------
######################################################
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
from plotly.tools import FigureFactory as FF
from decimal import *
from re import sub
from scipy.special import ndtr
# setup for our tools
# Credentials are read from the environment when the variables are set, so a
# real API key never has to be saved inside the notebook; the literals below
# are only the fallback used when the variables are absent.
plotly.tools.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', 'tammylarmstrong'),
    api_key=os.environ.get('PLOTLY_API_KEY', 'yourapikeyhere'))
# formatting: legend position shared by every layout defined below
legend = dict(x=0, y=100)
#layouts
def grouped_bar_layout(chartitle):
    """Build the layout for a grouped bar chart titled ``chartitle``.

    The layout uses the module-level ``legend`` settings, tilts the x-axis
    tick labels by 45 degrees and sets ``barmode='group'``.

    The result is stored in the module-level name ``grouped_bar`` (which
    existing callers read after calling this function) and is also returned,
    so new code can use the value directly instead of going through the
    global.
    """
    global grouped_bar
    grouped_bar = go.Layout(
        xaxis=dict(tickangle=45),
        legend=legend,
        title=chartitle,
        barmode='group')
    return grouped_bar
# grouped bar chart comparing actual racial distribution to census racial distribution
# census race groupings don't entirely match the inmate categories so we'll put
# mismatches in the "unknown" category
inmatepop = len(data.Race)
racecats = data.Race.cat.categories
# per-race inmate counts, then each race's share of the inmate population
raceraw = data.Race.value_counts(sort=False)
raceprop = (raceraw/inmatepop).values
# round to two decimals to match the precision of the census figures
raceprop2 = [round(Decimal(share), 2) for share in raceprop]
# one bar trace per data source
actual = go.Bar(name='Actual Proportion', x=racecats.tolist(), y=raceprop2)
census = go.Bar(
    name='Census Proportion',
    x=['Asian', 'Black', 'Pacific Islander', 'Unknown', 'White'],
    y=[0.02, 0.07, 0, 0.05, 0.86 ])
race_v_cens = [actual, census]
# grouped_bar_layout() stores its result in the global `grouped_bar`
grouped_bar_layout('Census vs. Inmate Data: Race')
fig = go.Figure(layout=grouped_bar, data=race_v_cens)
py.plot(fig, filename='census-vs-actual-race')
# check the statistical significance of the difference between
# the census and inmate proportions of Blacks in Polk County
# using a one-sample proportions test.
# The inmate proportion is looked up by category label rather than by a fixed
# position, so the test keeps using the right group if the set of race
# categories in the scraped data changes. Position 1 is kept only as the
# fallback for when no category is labelled 'Black'.
race_labels = racecats.tolist()
black_pos = race_labels.index('Black') if 'Black' in race_labels else 1
census_black = 0.07  # census proportion plotted for 'Black' in the bar chart
ZNum = (census_black - raceprop2[black_pos])
ZDen = (census_black*(1-census_black)/float(inmatepop))**.5
Z = ZNum/ZDen
# Z is computed as (census - inmate), so it is negative when the inmate
# proportion is the larger of the two. For a one-sided test at the 5% level
# the critical value is about 1.645: if the absolute value of Z is greater
# than that, we reject the null hypothesis.
print(Z)
In [4]:
# get the p-value: ndtr evaluates the standard normal CDF at Z
p_value = ndtr(Z)
print(p_value)
In [5]:
# compare racial distributions by gender
# rows are races, columns are sexes, cells are inmate counts
ct = pd.crosstab(data.Race, [data.Sex], rownames=['Race'], colnames=['Sex'])
race_labels = racecats.tolist()
# one bar trace per sex, both plotted against the same race labels
female = go.Bar(name='Female',
                x=list(race_labels),
                y=ct['Female'].values.tolist())
male = go.Bar(name='Male',
              x=list(race_labels),
              y=ct['Male'].values.tolist())
sex_race = [male, female]
def stack_layout(chartitle):
    """Build the layout for a stacked chart titled ``chartitle``.

    The layout uses the module-level ``legend`` settings, tilts the x-axis
    tick labels by 45 degrees and sets ``barmode='stack'``.

    The result is stored in the module-level name ``stack`` (which existing
    callers read after calling this function) and is also returned, so new
    code can use the value directly instead of going through the global.
    """
    global stack
    stack = go.Layout(barmode='stack',
                      xaxis=dict(tickangle=45),
                      title=chartitle,
                      legend=legend)
    return stack
# draw the stacked race-by-sex bar chart; stack_layout() leaves its result
# in the global `stack`
stack_layout('Inmate Gender Distribution by Race')
fig2 = go.Figure(layout=stack, data=sex_race)
py.plot(fig2, filename='race-by-sex')
# histogram of ages
# bins run from the youngest to the oldest inmate; the bin width is one fifth
# of that age range
ageMin = min(data.Age)
ageMax = max(data.Age)
ageBin = ((ageMax - ageMin)/5)
age_bins = dict(start=ageMin, end=ageMax, size=ageBin)
ages = [go.Histogram(x=data.Age, autobinx=False, xbins=age_bins)]
stack_layout('Inmate Age Distribution')
fig2a = go.Figure(layout=stack, data=ages)
py.plot(fig2a, filename='age')
# ages by gender
fAge = data.loc[data['Sex'] == 'Female', 'Age']
mAge = data.loc[data['Sex'] == 'Male', 'Age']
# both traces use the same explicit bins as the overall age histogram;
# each trace gets its own copy of the bin settings
shared_bins = dict(start=ageMin, end=ageMax, size=ageBin)
female1 = go.Histogram(x=fAge,
                       name='Female',
                       autobinx=False,
                       xbins=dict(shared_bins))
male1 = go.Histogram(x=mAge,
                     name='Male',
                     autobinx=False,
                     xbins=dict(shared_bins))
sex_age = [male1, female1]
stack_layout('Inmate Age Distribution by Gender')
sex_age_hist = go.Figure(layout=stack, data=sex_age)
py.plot(sex_age_hist, filename='age-by-sex')
# ages by race
# one histogram trace per race category, all on the shared age bins
fig3 = [
    go.Histogram(
        x=data.loc[data['Race'] == race_name, 'Age'],
        name=race_name,
        autobinx=False,
        xbins=dict(start=ageMin, end=ageMax, size=ageBin))
    for race_name in racecats
]
stack_layout('Inmate Age Distribution by Race')
race_age_hist = go.Figure(layout=stack, data=fig3)
py.plot(race_age_hist, filename='age-by-race')
# list of top 10 most frequent offenses of current inmates
topoffenses = data.Description.value_counts().nlargest(10)
# first row is the table header
top10 = [['Description of Arrest', 'Count']]
# Iterate over the rows nlargest() returned instead of a fixed index range, so
# that all ten offenses are listed (a range ending at 9 stops after nine) and
# nothing fails when fewer than ten distinct descriptions exist.
for description, count in zip(topoffenses.index, topoffenses.values):
    top10.append([description, count])
table = FF.create_table(top10)
py.plot(table, filename='top-10-offenses')
# analyze patterns in bonds
# first, convert the bond currency strings into integers and add to our data frame
bonds_num = list()
for bond in data.Bond.values:
    # keep only digits and the decimal point (drops '$', ',' and other text)
    digits = sub(r'[^\d.]', '', bond)
    # Parse through float() so an amount that includes cents, e.g. "1000.00",
    # is accepted (int() rejects a string containing a decimal point); the
    # fractional part is then truncated. A value with no digits at all is
    # recorded as 0 rather than raising ValueError.
    value = int(float(digits)) if digits.strip('.') else 0
    bonds_num.append(value)
data['BondNum'] = pd.Series(bonds_num, index=data.index)
data = data.sort_values('BondNum')
# average bond value is:
# (printed explicitly -- only the last expression of a cell is displayed)
print(data.BondNum.mean())
# average bond value for each race, shown as the cell output
data['BondNum'].groupby(data['Race']).mean()
Out[5]:
In [6]:
# most prisoners don't have a bond:
# (frequency of each distinct bond amount, most common first)
data['BondNum'].value_counts()
Out[6]:
In [ ]:
# histogram of bond values:
# bins are 10000 wide; starting at -5000 centres the first bin on 0
bond_bins = dict(start=-5000, end=1005000, size=10000)
bonds = [go.Histogram(x=data.BondNum, autobinx=False, xbins=bond_bins)]
stack_layout('Distribution of Bond Amounts, in Dollars')
bond_hist = go.Figure(layout=stack, data=bonds)
py.plot(bond_hist, filename='bonds')
# bond amounts by race: one histogram trace per race category, each using the
# same 10000-wide bins as the overall bond histogram
fig5 = [
    go.Histogram(
        x=data.loc[data['Race'] == race_name, 'BondNum'],
        name=race_name,
        autobinx=False,
        xbins=dict(start=-5000, end=1005000, size=10000))
    for race_name in racecats
]
stack_layout('Distribution of Bond Amounts by Race')
race_bond_hist = go.Figure(layout=stack, data=fig5)
py.plot(race_bond_hist, filename='bonds-by-race2')