Shapefiles of Iowa City buildings, and of all buildings in Johnson County, IA, are easy to obtain from a public repository. But the age/build year of those buildings is not in the data. Originally, I was going to do all of Johnson County, IA, but that proved to be very slow to load, so I opted to do just Iowa City — which presented a few problems.
########################################################################
# Input variable loading and environment declaration
########################################################################
# Shapefile/feature class of parcels, supplied as the tool's first parameter.
parcels = arcpy.GetParameterAsText(0)
# Base assessor URL; makeSoup() appends a parcel number (PPN) to it per request.
url = 'http://iowacity.iowaassessors.com/parcel.php?parcel='
# NOTE: the original code fetched `url` here with no parcel number and parsed
# the response into `soup`; that result was never read (the cursor loop builds
# a fresh soup per parcel via makeSoup), so the dead request has been removed.
# Add the output field only if a previous run hasn't already created it.
if 'YearBuilt' not in [f.name for f in arcpy.ListFields(parcels)]:
    arcpy.AddField_management(parcels,'YearBuilt','TEXT')
# Fields read/written by the cursors: parcel id, output year, property class.
parcels_fields = ['PPN','YearBuilt','PropClass']
This little bit sets up the Shapefile's dbf for reading in data and adding it to the appropriate field.
def makeSoup(ppn):
    """Fetch the assessor page for parcel *ppn* and return it parsed."""
    response = urllib2.urlopen(url + ppn)
    return bs(response.read())
def getYrBuilt(soup,PropClass):
    """Scrape the build year for one parcel from its assessor page.

    soup      -- parsed assessor page (BeautifulSoup) for the parcel
    PropClass -- property-class code; 'c'/'i' means commercial/industrial
    Returns the year text, or 'unknown' if the page lacks the data table.
    """
    # Commercial/industrial pages lay the data out differently from
    # residential ones, so pick the div name and cell index accordingly.
    pc = PropClass.lower()
    if 'c' in pc or 'i' in pc:
        name = 'commercialData'
        dataIndex = 1
    else:
        name = 'residentialData'
        dataIndex = 2
    #try to pull year built
    try:
        table = soup.find('div',{'class': name})
        data = table.findAll('div')
        date = data[dataIndex].text
    # Narrowed from a bare except: find() returning None raises
    # AttributeError; a short cell list raises IndexError. Anything
    # else (e.g. KeyboardInterrupt) should propagate.
    except (AttributeError, IndexError):
        date = 'unknown'
    return date
Some simple functions to pull the data I need from the HTML generated in the "makeSoup" function. Next it was a simple matter of iterating through lines in the Shapefile data using an update cursor.
# Total parcel count, used to size the step progressor below.
featureCount = int(arcpy.GetCount_management(parcels).getOutput(0))
arcpy.SetProgressor('step','finding build years....',0,featureCount,1)
# Only (re)process rows still marked 'unknown' so the tool can resume a
# partial run. NOTE(review): rows whose YearBuilt is NULL (e.g. when the
# field was just added) will NOT match this filter -- confirm the field is
# pre-populated with 'unknown' before running.
Query = "YearBuilt = 'unknown'"
with arcpy.da.UpdateCursor(parcels,parcels_fields,Query) as cursor:
    for row in cursor:
        soup = makeSoup(row[0])        # row[0] = PPN: fetch this parcel's page
        date = getYrBuilt(soup,row[2]) # row[2] = PropClass: picks the table layout
        row[1] = str(date)             # row[1] = YearBuilt
        arcpy.SetProgressorLabel('Parcel: {0}, year: {1}'.format(row[0],date))
        cursor.updateRow(row)
        arcpy.SetProgressorPosition()
arcpy.ResetProgressor()
arcpy.AddMessage('Process Complete')
One thing I did not account for was getting flagged for data mining. As I was testing snippets of code using small lists of parcel numbers, I apparently didn't set off any triggers on the assessor's website. But once I tried to pull the data for every building within the Iowa City limits, they flagged me and I had to contact a representative from the assessor's office to get the data I wanted — delaying the simple project a couple of days. No biggie, but they still have me blocked...
Of course University of Iowa Buildings weren't available from the assessor. Thankfully, the University was eager to share a facilities web page with construction years for all their buildings. I used a simple web scraper script to make this go quicker. I still had to do some manual editing...
#import modules
from BeautifulSoup import BeautifulSoup as bs
from BeautifulSoup import SoupStrainer
import urllib2
import re
import csv
import time
import string
import random
from collections import defaultdict
from operator import itemgetter
#declare functions
#---------------------------
#get address and build year
# Landing page that lists every University of Iowa building; fetched once
# so getUrls() can harvest the per-building links from it.
baseUrl = 'http://www.facilities.uiowa.edu/bldg/bldg-list.asp?submenuheader=3'
page = urllib2.urlopen(baseUrl)
soup = bs(page.read())
# Prefix for the relative hrefs found in the listing table.
base = 'http://www.facilities.uiowa.edu/bldg/'
# Filled by getUrls() with one absolute URL per building.
urlList = []
def getUrls(base,soup,urlList):
    """Append the absolute URL of each building page found in *soup* to
    *urlList* and return it.

    base    -- URL prefix to prepend to each relative href
    soup    -- parsed listing page (BeautifulSoup)
    urlList -- list to accumulate results into (mutated in place)
    """
    cells = soup.findAll('td', {'width': '294'})
    for cell in cells:
        # Cells with a single child carry no link; skip them.
        if len(cell) > 1:
            link = cell.find('a')
            urlList.append(base + link.get('href'))
    return urlList
def getBuildNo(url):
    """Return the building number from a facilities URL.

    The number is whatever follows the final '=' in the query string.
    Using rsplit (instead of the original index of the FIRST '=') keeps
    this correct for URLs carrying more than one query parameter.
    """
    return url.rsplit('=', 1)[1]
#get address
def getAddress(soup):
    """Return the building's street address with whitespace normalized."""
    # The address lives in the second cell of the fifth table row.
    cell = soup.findAll('tr')[4].findAll('td')[1]
    raw = cell.contents[0].renderContents()
    # Drop embedded CRLF pairs, then collapse all runs of whitespace
    # into single spaces.
    raw = raw.replace('\r\n',' ')
    return ' '.join(raw.split())
def getYear(soup):
    """Return the construction-year cell's raw contents from the page."""
    # The year is the second cell of the eighth table row.
    row = soup.findAll('tr')[7]
    return row.findAll('td')[1].renderContents()
#create list of URLS
urls = getUrls(base,soup,urlList)
#List of buildings that will have lists
buildings = []
for url in urls:
    print 'working on {0}...'.format(url)
    # Small random delay between requests to avoid hammering the server.
    time.sleep(random.random())
    page = urllib2.urlopen(url)
    soup = bs(page.read())
    address = getAddress(soup)
    year = getYear(soup)
    # NOTE(review): Num is computed but never appended to the record below,
    # yet the CSV header starts with 'Building Number' -- confirm whether
    # the first column should be Num rather than the URL.
    Num = getBuildNo(url)
    buildings.append([url,address,year])
#write output to CSV
csvHeader = ['Building Number', 'Address','Build_year']
with open('UiowaBuildings_2.csv','wb') as buildingCSV:
    writer = csv.writer(buildingCSV)
    writer.writerow(csvHeader)
    for i in buildings:
        data = [i[0],i[1],i[2]]
        writer.writerow(data)
According to the assessor, any structure with a limestone foundation and no known construction date (or one that predates 1930) is simply assigned a construction year of 1900. This creates an artificial "boom" in construction at the turn of the century. To "disperse" the boom, I created a script in ArcGIS that looked at every neighbor within 200 feet of each structure with a build year of 1900. The median build year of the neighboring structures was then assigned to the building in question. Here's the code I wrote for that.
###################################################################################################
#Import python modules
###################################################################################################
import os
import arcpy
from arcpy import env
from itertools import *
from operator import itemgetter
from collections import defaultdict
import numpy
from random import randint
################################################################################################
#Input Variable loading and environment declaration
################################################################################################
buildings = arcpy.GetParameterAsText(0) #feature class of buildings
# Total feature count, used to size the step progressors below.
featureCount = int(arcpy.GetCount_management(buildings).getOutput(0))
################################################################################################
# Defining global functions
################################################################################################
buildingIDs = []  # OBJECTIDs of every building whose recorded build year is 1900
randDict = {}     # OBJECTID -> replacement build year computed from its neighbors
################################################################################################
#Global variables to be used in process
################################################################################################
#create field lists
fields = [u'OBJECTID',u'YearBuilt']
#building IDs and dictionary to track new build year
# A feature layer is needed so the SelectLayerBy* tools below can hold selections.
layer = arcpy.MakeFeatureLayer_management(buildings,"Temporary_Layer")
################################################################################################
#get a list of all the buildings with a build year of 1900 and the corresponding object IDs to select them
################################################################################################
arcpy.SetProgressor("step","creating list of buildings with build year 1900 ",0,featureCount,1)
with arcpy.da.SearchCursor(buildings, fields) as cursor:
    for row in cursor:
        # 1900 is the assessor's placeholder for "unknown / pre-1930".
        if int(row[1]) == 1900:
            buildingIDs.append(row[0])
        arcpy.SetProgressorPosition() #update progressor position
arcpy.AddMessage("{0} buildings found with build year of 1900".format((len(buildingIDs))))
################################################################################################
# For each placeholder-1900 building, derive a replacement year from the
# build years of its neighbors within 200 feet.
################################################################################################
arcpy.SetProgressor("step","Iterating through buildings...",0,len(buildingIDs),1)
for building in buildingIDs:
    #query to select
    query = "OBJECTID = {0}".format(building)
    #update progressor label
    arcpy.SetProgressorLabel('creating selection for {0}'.format(building))
    #create selection for specific building
    selection = arcpy.SelectLayerByAttribute_management(layer,"NEW_SELECTION",query)
    arcpy.SetProgressorLabel('selecting buildings within 200 feet of {0}'.format(building))
    #get neighbors within 200 feet of selection
    # NOTE(review): argument order makes `selection` the layer being selected
    # FROM and `layer` the reference features -- confirm this selects the
    # neighbors of the single building rather than the reverse.
    neighbors = arcpy.SelectLayerByLocation_management(selection,"WITHIN_A_DISTANCE",layer,"200 feet","NEW_SELECTION")
    neighborYears = []
    with arcpy.da.SearchCursor(neighbors,fields) as cursor:
        for row in cursor:
            neighborYears.append(int(row[1]))
    #remove duplicate build years
    neighborYears = list(set(neighborYears))
    # avg is informational only (shown in the progressor label); med is used.
    avg = sum(neighborYears)/len(neighborYears)
    med = int(numpy.median(numpy.array(neighborYears)))
    #if median is 1900 (neighbors are placeholders too) randomly assign a
    #construction year in a plausible range
    if med == 1900:
        med = randint(1880,1930)
    #create entry in dictionary
    randDict[building] = med
    arcpy.SetProgressorLabel('{0} avg build year: {1}, median build year: {2}'.format(building,avg,med))
    arcpy.SetProgressorPosition()
################################################################################################
#Use update cursor to write the newly derived build years back to the data
################################################################################################
arcpy.SetProgressor('step','updating building years for {0} buildings'.format(len(buildingIDs)),0,featureCount,1)
with arcpy.da.UpdateCursor(buildings,fields) as cursor:
    for row in cursor:
        try:
            # row[0] = OBJECTID; only buildings present in randDict get a new year.
            row[1] = str(int(randDict[row[0]]))
            cursor.updateRow(row)
        except KeyError:
            # Building was not a 1900 placeholder -- keep its original year.
            pass
        arcpy.SetProgressorPosition()
################################################################################################
#Final Output and cleaning of temp data/variables
################################################################################################
del layer
arcpy.AddMessage("Process complete!")
There are still some inaccuracies in the data, but it's sufficient for gaining an idea of how and when Iowa City has expanded.
The final map output can be found here.