Code from part one of the Polk County inmates series
Polk County Inmates - Part 1
In [9]:
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 01 18:15:05 2016
Polk County arrest records crawler
@author: Tammy
"""
import urllib2
from bs4 import BeautifulSoup
import re
import pandas as pd
# define base URL for county website
base = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'
# empty list that we will fill with one dict per inmate page
master = []


def _fetch_soup(address):
    """Download `address` and return its HTML parsed by BeautifulSoup."""
    response = urllib2.urlopen(address)
    try:
        return BeautifulSoup(response.read(), 'html.parser')
    finally:
        # release the connection even when reading or parsing fails
        response.close()


def _to_int(text):
    """Return `text` converted to int, or None when it is blank/non-numeric."""
    try:
        return int(text)
    except ValueError:
        return None


def _parse_inmate(cells):
    """Build one inmate record from the cleaned <td> texts of a detail page.

    `cells` is the list of stripped cell texts in page order. Returns a dict
    with the 14 fields read positionally below, or None when the page has
    fewer than 14 cells.
    """
    # NOTE(review): only the first 14 cells are read; if a page lists several
    # charges, the later Case #/Description/Bond rows are ignored -- confirm.
    if len(cells) < 14:
        return None
    return {'ID': cells[0], 'Name': cells[1], 'Arrest date/time': cells[2],
            'Age': _to_int(cells[3]), 'Height': cells[4],
            'Weight': _to_int(cells[5]), 'Race': cells[6], 'Sex': cells[7],
            'Eyes': cells[8], 'Hair': cells[9], 'Case #': cells[10],
            'Description': cells[11], 'Bond': cells[12],
            'Bond Type': cells[13]}


# get the link IDs for each inmate page so that we can iterate over each
src = _fetch_soup(base)
arrests = src.find_all('a')  # find all link tags on our table of contents page
pattern = re.compile(".*(bi)")  # the pattern we're looking for in the links

# iterate through every link (including the first one), check that it's the
# type we want, then parse the linked arrest page to grab info
for link in arrests:
    href = link.get('href')  # anchors without an href yield None
    if not href or not pattern.match(href):
        continue
    detail = _fetch_soup(base + href)
    # get_text() also works for empty cells and cells with nested tags, where
    # .string is None; the text is kept as unicode so non-ASCII names survive
    cells = [cell.get_text().strip() for cell in detail.find_all('td')]
    info = _parse_inmate(cells)
    if info is None:
        print('skipping %s: fewer table cells than expected' % href)
        continue
    # add the latest info to the master list
    master.append(info)
In [7]:
# show a sample of our info: the second and third scraped records
# (a slice prints whatever exists instead of raising IndexError on a short list)
for record in master[1:3]:
    print(record)
In [8]:
# turn into a data frame that we can play with, and see a sample of the data
data = pd.DataFrame(master)
# display rows 1-9 directly; `.describe` without parentheses is never called
# and would only echo the bound method's repr
data.iloc[1:10]
Out[8]:
In [5]:
# store the descriptive text columns as pandas categoricals
cats = ['Race', 'Sex', 'Eyes', 'Hair', 'Description', 'Bond Type']
for column in cats:
    as_category = data[column].astype('category')
    data[column] = as_category

# number of inmates per race, and the total number of rows in the frame
race_counts = data['Race'].value_counts()
jailed = len(data['Race'])

# fraction of total inmates by race (displayed as the cell's output)
race_counts / jailed
Out[5]: