Code from part one of the polk county inmates series

Polk County Inmates - Part 1

Polk County Inmates - Part 1

In [9]:
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 01 18:15:05 2016
Polk County arrest records crawler
@author: Tammy
"""
import urllib2
from bs4 import BeautifulSoup
import re
import pandas as pd

# define base URL for county website
base = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'
# empty tuple that we will fill with dicts later
master = []
   
# get the link IDs for each inmate page so that we can iterate over each
url = urllib2.urlopen(base)
doc = url.read()
src = BeautifulSoup(doc, 'html.parser')

arrests = src.find_all('a') # find all link tags on our table of contents page
pattern = re.compile(".*(bi)") # the pattern we're looking for in the links

# iterate through each link, check that it's the type we want, grab the rest of
# the URL, then parse through each arrest page to grab info
for arrest in range(1, len(arrests)): 
    if pattern.match(arrests[arrest]['href']) :
        page = arrests[arrest]['href']
        site = base + page
        url2 = urllib2.urlopen(site)
        doc2 = url2.read()
        src2 = BeautifulSoup(doc2, 'html.parser')

        # create a list of all td tags on the arrest record page
        td = src2.find_all('td')
        # clean up the td tag strings
        b = list()
        for t in td:
            b.append(str(t.string.strip()))
        
        # define a dictionary of all the info housed in the tags
        info = {'ID': b[0], 'Name': b[1], 'Arrest date/time': b[2], 'Age': int(b[3]), 
        'Height': b[4], 'Weight': int(b[5]), 'Race': b[6], 'Sex': b[7], 
        'Eyes': b[8], 'Hair': b[9], 'Case #': b[10], 'Description': b[11], 
        'Bond': b[12], 'Bond Type': b[13]}
        
        #add the latest info to a master tuple
        master.append(info)
In [7]:
# show a sample of our info
print(master[1])
print(master[2])
{'Name': 'TONY ALLEN COLLETT', 'Age': 44, 'Race': 'White', 'Height': '5\' 08"', 'ID': '662409', 'Bond': '$1,000', 'Case #': 'WAPELLO', 'Eyes': 'Brown', 'Description': 'OPERATING WHILE INTOXICATED - 1ST OFFENSE - OWI 1ST', 'Weight': 230, 'Bond Type': 'Cash or Surety', 'Sex': 'Male', 'Hair': 'Gray', 'Arrest date/time': '8/11/2016 8:06 PM'}
{'Name': 'TINA LOUISE GARRICK', 'Age': 44, 'Race': 'White', 'Height': '5\' 07"', 'ID': '1062131', 'Bond': '$300', 'Case #': '', 'Eyes': 'Blue', 'Description': 'POSSESSION OF DRUG PARAPHERNALIA', 'Weight': 150, 'Bond Type': 'Cash or Surety', 'Sex': 'Female', 'Hair': 'Brown', 'Arrest date/time': '8/11/2016 7:34 PM'}
In [8]:
# turn into a data frame that we can play with, and see a sample of the data
data = pd.DataFrame(master)
data[1:10].describe
Out[8]:
<bound method DataFrame.describe of    Age   Arrest date/time    Bond       Bond Type    Case #  \
1   44  8/11/2016 8:06 PM  $1,000  Cash or Surety   WAPELLO   
2   44  8/11/2016 7:34 PM    $300  Cash or Surety             
3   53  8/11/2016 6:50 PM      $0         No Bond             
4   30  8/11/2016 6:23 PM    $300  Cash or Surety             
5   25  8/11/2016 5:52 PM    $300  Cash or Surety             
6   30  8/11/2016 5:09 PM  $2,000            APPL  AG295142   
7   35  8/11/2016 5:01 PM  $2,000  Cash or Surety  AG297849   
8   32  8/11/2016 4:34 PM    $300  Cash or Surety             
9   39  8/11/2016 4:15 PM  $1,000            APPL  AG295157   

                                         Description   Eyes    Hair  Height  \
1  OPERATING WHILE INTOXICATED - 1ST OFFENSE - OW...  Brown    Gray  5' 08"   
2                   POSSESSION OF DRUG PARAPHERNALIA   Blue   Brown  5' 07"   
3  DOMESTIC ABUSE ASSAULT DISPLAY OR USE OF A WEA...   Blue  Blonde  5' 06"   
4                                   THEFT 5TH DEGREE   Blue  Blonde  5' 11"   
5             DRIVING WHILE LICENSE UNDER SUSPENSION  Brown   Brown  5' 10"   
6  UNAUTHORIZED USE OF CREDIT CARD OVER $1,000 UN...   Blue  Blonde  5' 05"   
7                                   THEFT 3RD DEGREE   Blue   Brown  6' 00"   
8                       TRESPASS -- No Damage/Injury  Hazel   Brown  6' 00"   
9                                     ASSAULT (SMMS)  Brown   Black  5' 09"   

        ID                     Name   Race     Sex  Weight  
1   662409       TONY ALLEN COLLETT  White    Male     230  
2  1062131      TINA LOUISE GARRICK  White  Female     150  
3  1004075         CARLA SUE KESTER  White  Female     150  
4    48257          RONNIE LEE HILL  White    Male     150  
5   440035      PHILLIP JOHN BRUMER  White    Male     160  
6   146762        ASHLEY LYNN NUNEZ  White  Female     155  
7   149934     LADONNA ELNORA WOODS  White  Female     270  
8    75424  JEBEDIAH CASSIDY STUMPF  White    Male     200  
9    46387    DAVID PHILLIP COLEMAN  Black    Male     200  >
In [5]:
# convert certain columns to categorical variables
cats = ['Race', 'Sex', 'Eyes', 'Hair', 'Description', 'Bond Type']
for cat in cats:
    data[cat] = data[cat].astype('category')

jailed = len(data['Race'])
race_counts = data['Race'].value_counts()
race_counts/jailed # percent of total inmates by race
Out[5]:
White               0.679487
Black               0.300699
Asian               0.017483
Pacific Islander    0.002331
dtype: float64