# -*- coding: utf-8 -*-
"""
Created on Mon Aug 01 18:15:05 2016
Polk County arrest records crawler
@author: Tammy
"""
import urllib2
from bs4 import BeautifulSoup
import re
import pandas as pd

# define base URL for county website
base = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'
# empty list that we will fill with one dict per inmate later
master = []

# fetch the table of contents page, which links to each inmate page
url = urllib2.urlopen(base)
doc = url.read()
src = BeautifulSoup(doc, 'html.parser')

arrests = src.find_all('a') # find all link tags on our table of contents page
pattern = re.compile(".*(bi)") # the pattern we're looking for in the links

# iterate through every link after the first one, check that it's the type we
# want, grab the rest of the URL, then parse each arrest page to grab info
for link in arrests[1:]:
    # an <a> tag without an href attribute cannot be an inmate link; .get()
    # avoids the KeyError that link['href'] would raise for such a tag
    page = link.get('href')
    if page is None or not pattern.match(page):
        continue

    site = base + page
    url2 = urllib2.urlopen(site)
    doc2 = url2.read()
    src2 = BeautifulSoup(doc2, 'html.parser')

    # create a list of all td tags on the arrest record page
    td = src2.find_all('td')
    # clean up the td tag strings; get_text() is used because .string is
    # None for a cell that holds more than one child node
    b = [str(t.get_text().strip()) for t in td]

    # define a dictionary of all the info housed in the tags; the cells are
    # read by position, so this relies on the order of the cells on the page
    info = {'ID': b[0], 'Name': b[1], 'Arrest date/time': b[2], 'Age': int(b[3]),
            'Height': b[4], 'Weight': int(b[5]), 'Race': b[6], 'Sex': b[7],
            'Eyes': b[8], 'Hair': b[9], 'Case #': b[10], 'Description': b[11],
            'Bond': b[12], 'Bond Type': b[13]}

    # add the latest info to the master list
    master.append(info)
        
    
# build a data frame from the scraped records (one row per dict in master)
data = pd.DataFrame(master)

# these columns are converted from plain strings to pandas categoricals
cats = ['Race', 'Sex', 'Eyes', 'Hair', 'Description', 'Bond Type']
for column_name in cats:
    column = data[column_name]
    data[column_name] = column.astype('category')

######################################################
#------------ NEW CODE FOR PART TWO ------------------
######################################################

import plotly
import plotly.graph_objs as go
import plotly.plotly as py
from plotly.tools import FigureFactory as FF 
from decimal import *
from re import sub
from scipy.special import ndtr

# store the plotly username and API key (the key here is a placeholder)
plotly.tools.set_credentials_file(
    username='tammylarmstrong',
    api_key='yourapikeyhere')

# legend settings shared by the chart layouts defined below
legend = {'x': 0, 'y': 100}

#layouts
def grouped_bar_layout(chartitle):
    """Build a grouped-bar chart layout titled *chartitle*.

    The layout is stored in the module-level ``grouped_bar`` (which the
    existing callers read) and is also returned, so new callers do not have
    to go through the global.
    """
    global grouped_bar
    grouped_bar = go.Layout(
        xaxis=dict(tickangle=45),
        legend=legend,
        title=chartitle,
        barmode='group')
    return grouped_bar

# grouped bar chart: inmate racial distribution next to the census one.
# census race groupings don't entirely match the inmate categories, so
# mismatches go in the "unknown" category
inmatepop = len(data.Race)
racecats = data.Race.cat.categories
# inmates per race (unsorted), then each race's share of all inmates
raceraw = data.Race.value_counts(sort=False)
raceprop = (raceraw / inmatepop).values

# round the shares to two decimals to match our census data
raceprop2 = [round(Decimal(share), 2) for share in raceprop]

# the two bar series that get plotted side by side
actual = go.Bar(x=racecats.tolist(), y=raceprop2, name='Actual Proportion')
census = go.Bar(
    name='Census Proportion',
    x=['Asian', 'Black', 'Pacific Islander', 'Unknown', 'White'],
    y=[0.02, 0.07, 0, 0.05, 0.86])
race_v_cens = [actual, census]

# grouped_bar_layout() leaves the layout in the global grouped_bar
grouped_bar_layout('Census vs. Inmate Data: Race')
fig = go.Figure(layout=grouped_bar, data=race_v_cens)
py.plot(fig, filename='census-vs-actual-race')

# check the statistical significance of the difference between
# the census and inmate proportions of Blacks in Polk County
# using a one-sample proportions test.

# census proportion used as the null hypothesis (same figure as the census bar)
census_black = 0.07
# look the inmate proportion up by category label rather than by a fixed
# position, so a change in the set of race categories cannot silently
# select a different group (raises KeyError if 'Black' is not a category)
black_prop = float(raceprop2[racecats.get_loc('Black')])

ZNum = (census_black - black_prop)
ZDen = (census_black*(1-census_black)/float(inmatepop))**.5
Z = ZNum/ZDen

# If the absolute value of Z is greater than 1.65, we reject the null hypothesis
print(Z)

# output from the original run:
# -25.0942310125

# get the p-value: ndtr is the standard normal CDF, so this is the
# probability of a value at or below Z
print(ndtr(Z))

# output from the original run:
# 2.87471807598e-139

# compare racial distributions by gender
ct = pd.crosstab(data.Race, [data.Sex], rownames=['Race'], colnames=['Sex'])
# one bar series per sex: x holds the race categories, y the counts taken
# from that sex's column of the crosstab
female, male = [
    go.Bar(x=racecats.tolist(), y=ct[sex].values.tolist(), name=sex)
    for sex in ('Female', 'Male')
]
sex_race = [male, female]

def stack_layout(chartitle):
    """Build a stacked-bar chart layout titled *chartitle*.

    The layout is stored in the module-level ``stack`` (which the existing
    callers read) and is also returned, so new callers do not have to go
    through the global.
    """
    global stack
    stack = go.Layout(barmode='stack',
                      xaxis=dict(tickangle=45),
                      title=chartitle,
                      legend=legend)
    return stack
# stack_layout() leaves the layout in the global stack; plot race by sex with it
stack_layout('Inmate Gender Distribution by Race')
fig2 = go.Figure(layout=stack, data=sex_race)
py.plot(fig2, filename='race-by-sex')

# histogram of ages
# bins run from the youngest to the oldest inmate age; the bin size is that
# range divided by 5
ageMin = min(data.Age)
ageMax = max(data.Age)
ageBin = (ageMax - ageMin) / 5

ages = [
    go.Histogram(
        x=data.Age,
        autobinx=False,
        xbins={'start': ageMin, 'end': ageMax, 'size': ageBin})
]
stack_layout('Inmate Age Distribution')
fig2a = go.Figure(layout=stack, data=ages)
py.plot(fig2a, filename='age')

# ages by gender
def _age_hist(values, label):
    """Return a histogram of *values* named *label*, using the age bins."""
    return go.Histogram(
        x=values,
        name=label,
        autobinx=False,
        xbins=dict(start=ageMin, end=ageMax, size=ageBin))


# split the ages by the inmate's sex
fAge = data.Age[data.Sex == 'Female']
mAge = data.Age[data.Sex == 'Male']
female1 = _age_hist(fAge, 'Female')
male1 = _age_hist(mAge, 'Male')

sex_age = [male1, female1]
stack_layout('Inmate Age Distribution by Gender')
sex_age_hist = go.Figure(layout=stack, data=sex_age)
py.plot(sex_age_hist, filename='age-by-sex')

# ages by race: one histogram per race category, all using the same age bins
fig3 = [
    go.Histogram(
        x=data.Age[data.Race == race],
        name=race,
        autobinx=False,
        xbins={'start': ageMin, 'end': ageMax, 'size': ageBin})
    for race in racecats
]

stack_layout('Inmate Age Distribution by Race')
race_age_hist = go.Figure(layout=stack, data=fig3)
py.plot(race_age_hist, filename='age-by-race')

# list of top 10 most frequent offenses of current inmates
topoffenses = data.Description.value_counts().nlargest(10)
# first row is the table header; one [description, count] row follows per offense
top10 = [['Description of Arrest', 'Count']]
# walk the labels and counts together: this includes all rows returned by
# nlargest (the previous range(0, 9) stopped after nine), copes with fewer
# than ten distinct offenses, and avoids positional integer lookups on a
# Series that is indexed by description
for offense, count in zip(topoffenses.index, topoffenses.values):
    top10.append([offense, count])

table = FF.create_table(top10)
py.plot(table, filename='top-10-offenses')

# analyze patterns in bonds
# first, convert the bond currency strings into integers and add to our data frame
# the regex strips everything except digits and the decimal point; going
# through float() first lets a value with a decimal part (which int() alone
# rejects) be converted, truncating toward zero
bonds_num = [int(float(sub(r'[^\d.]', '', bond))) for bond in data.Bond.values]
data['BondNum'] = pd.Series(bonds_num, index=data.index)
data = data.sort_values('BondNum')

# average bond value, overall and by race (printed explicitly: a bare
# expression displays nothing when the file is run as a script)
print(data.BondNum.mean())
print(data['BondNum'].groupby(data['Race']).mean())

# per-race output from the original run:
# Race
# Asian               10636.363636
# Black               28664.457831
# Pacific Islander     1333.333333
# White               22373.630137
# Name: BondNum, dtype: float64

# most prisoners don't have a bond:
print(data.BondNum.value_counts())

# output from the original run:
# 0          319
# 5000       113
# 2000        92
# 1000        78
# 10000       55
# 100000      32
# 25000       31
# 50000       30
# 500         17
# 300         16
# 4000        13
# 500000      12
# 1000000      5
# 20000        5
# 15000        4
# 2500         4
# 200000       4
# 6000         2
# 250000       2
# 12000        1
# 75000        1
# 85000        1
# 40000        1
# 125000       1
# 120000       1
# 155000       1
# 1850         1
# 225000       1
# 26000        1
# 54000        1
# 3000         1
# 3500         1
# Name: BondNum, dtype: int64

# histogram of bond values, with bins of size 10000 from -5000 to 1005000:
bonds = [
    go.Histogram(
        x=data.BondNum,
        autobinx=False,
        xbins={'start': -5000, 'end': 1005000, 'size': 10000})
]
stack_layout('Distribution of Bond Amounts, in Dollars')
bond_hist = go.Figure(layout=stack, data=bonds)
py.plot(bond_hist, filename='bonds')

# one bond histogram per race category, all using the same bins
fig5 = [
    go.Histogram(
        x=data.BondNum[data.Race == race],
        name=race,
        autobinx=False,
        xbins={'start': -5000, 'end': 1005000, 'size': 10000})
    for race in racecats
]

stack_layout('Distribution of Bond Amounts by Race')
race_bond_hist = go.Figure(layout=stack, data=fig5)
py.plot(race_bond_hist, filename='bonds-by-race2')

# Code from part TWO of the Polk County inmates series