scraper.py
###############################################################################
# This scraper collects the reported crime figures for New York City as a whole
# and for each individual precinct from the PDF files posted on the NYPD's
# website.
# This scraper is based on an earlier scraper by Andrew Wheeler.
###############################################################################
import scraperwiki
from BeautifulSoup import BeautifulSoup
import re
def precinct_number(area):
    # Map the named Manhattan precincts onto their numeric designations
    if area == "Central Park Precinct":
        area = "22nd Precinct"
    if area == "Midtown Precinct South":
        area = "14th Precinct"
    if area == "Midtown Precinct North":
        area = "18th Precinct"
    # Return the leading digits of the precinct name, e.g. "13th Precinct" -> 13;
    # borough and citywide areas have no leading digits and return ""
    for i in range(len(area)):
        if not area[i].isdigit():
            if i > 0:
                return int(area[0:i])
            else:
                break
    return ""
# The commented-out block below imported historical numbers from Andy's scraper
scraperwiki.sqlite.attach("current-week-reported-crime-city-wide-and-for-prec", "src")
'''
# Get one week of data for testing purposes (cmb)
results = scraperwiki.sqlite.select("`area`, `week`, `murder`, `rape`, `robbery`, `fa` as `felony_assault`, `burg` as `burglary`, `lar` as `grand_larceny`, `mvt` as `grand_larceny_auto`, `volume` as `edition` FROM src.swdata")
for result in results:
    dates = result['week'].split() # Report Covering the Week <start> Through <end>
    result['start_date'] = dates[4]
    result['end_date'] = dates[6]
    del result['week']
    result['precinct'] = precinct_number(result['area'])
    print result
scraperwiki.sqlite.save(unique_keys=['area','end_date'], data=results, table_name="CrimeStatistics")
'''
# define the order our columns are displayed in the datastore
scraperwiki.sqlite.execute("CREATE TABLE IF NOT EXISTS `CrimeStatistics` (`edition` text ,`area` text,`start_date` text,`end_date` text,`murder` integer,`rape` integer,`robbery` integer,`felony_assault` integer,`burglary` integer,`grand_larceny` integer,`grand_larceny_auto` integer,`petit_larceny` integer,`misdemeanor_assault` integer,`misdemeanor_sex_crimes` integer,`shooting_victim` integer,`shooting_incident` integer)")
# Flag for the first PDF scraped: the stored current/previous week is updated
# only once per run, and only if the week has changed
first = True
def string_cleaner(data):
    # Keep the first whitespace-delimited token and strip any thousands separators
    data = data.split()[0].replace(",", "")
    return data
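# A minimal sketch of string_cleaner on a typical cell; the sample value is an
# assumption about how the counts appear in the converted PDF (count first,
# followed by other figures), not real PDF data.
'''
print string_cleaner("1,234 5.6")  # -> "1234": first token kept, comma stripped
'''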
# scrape_table function: scrapes the crime figures out of one converted PDF
def scrape_table(soup_pdf):
    global first
    tds = soup_pdf.findAll('text') # get all the <text> tags
    tds_text = [None]*len(tds)
    for i in range(len(tds)): # replace each tag with the text it contains
        tds_text[i] = tds[i].text
    data = {}
    # All I have to go on is the relative location of the numbers, so I search for
    # specific label strings and take the value in the cell that follows each one
    # (see the commented-out sketch after this function). If the format of the PDF
    # changes, there is a good chance this won't work.
    data['murder'] = string_cleaner(tds_text[tds_text.index('Murder') + 1])
    data['rape'] = string_cleaner(tds_text[tds_text.index('Rape') + 1])
    data['robbery'] = string_cleaner(tds_text[tds_text.index('Robbery') + 1])
    data['felony_assault'] = string_cleaner(tds_text[tds_text.index('Fel. Assault') + 1])
    data['burglary'] = string_cleaner(tds_text[tds_text.index('Burglary') + 1])
    data['grand_larceny'] = string_cleaner(tds_text[tds_text.index('Gr. Larceny') + 1])
    data['grand_larceny_auto'] = string_cleaner(tds_text[tds_text.index('G.L.A.') + 1])
    data['petit_larceny'] = string_cleaner(tds_text[tds_text.index('Petit Larceny') + 1])
    data['misdemeanor_assault'] = string_cleaner(tds_text[tds_text.index('Misd. Assault') + 1])
    data['misdemeanor_sex_crimes'] = string_cleaner(tds_text[tds_text.index('Misd. Sex Crimes') + 1])
    #data['shooting_victim'] = string_cleaner(tds_text[tds_text.index('Shooting Vic.') + 1])
    #data['shooting_incident'] = string_cleaner(tds_text[tds_text.index('Shooting Inc.') + 1])
    data['edition'] = str(soup_pdf.find(text=re.compile('Volume')))
    dates = str(soup_pdf.find(text=re.compile('Report Covering the Week'))).split() # Report Covering the Week <start> Through <end>
    data['start_date'] = dates[4]
    data['end_date'] = dates[6]
    # For the borough-level PDFs the area name sits one cell later than it does in
    # the precinct and citywide PDFs
    if tds[8].text == 'Patrol Borough':
        data['area'] = tds[9].text
    else:
        data['area'] = tds[8].text
    data['precinct'] = precinct_number(data['area'])
    scraperwiki.sqlite.save(unique_keys=['area','end_date'], data=data, table_name="CrimeStatistics")
    # On the first time through the loop, update the stored week markers
    if first:
        first = False
        stored_end_date = scraperwiki.sqlite.get_var('current_end_date')
        if stored_end_date != data['end_date']:
            scraperwiki.sqlite.save_var('previous_end_date', stored_end_date)
            scraperwiki.sqlite.save_var('current_end_date', data['end_date'])
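# A minimal sketch of the label/value lookup used above, run on a made-up slice of
# pdftoxml output; the contents of `sample` are assumptions for illustration, not
# real PDF data.
'''
sample = ['Murder', '2 0 +200.0%', 'Rape', '1 3 -66.7%']
print string_cleaner(sample[sample.index('Murder') + 1])  # -> "2"
print string_cleaner(sample[sample.index('Rape') + 1])    # -> "1"
'''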
# scrape_and_look_for_next_link function: collects every PDF link on the statistics
# page, rewrites each relative href into an absolute URL, and runs scrape_table on
# the converted XML of each PDF (see the sketch after this function)
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    find_link = soup.findAll(href=re.compile("/downloads/pdf/crime_statistics/"))
    next_link = [None]*len(find_link)
    rep_link = [None]*len(find_link)
    for i in range(len(find_link)):
        next_link[i] = find_link[i]['href']
        rep_link[i] = next_link[i].replace('../..', 'http://www.nyc.gov/html/nypd')
    for i in range(len(rep_link)):
        a = scraperwiki.scrape(rep_link[i]) # fetch the PDF itself
        soup_pdf = BeautifulSoup(scraperwiki.pdftoxml(a)) # convert the PDF to XML and parse it
        scrape_table(soup_pdf)
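# A minimal sketch of the href rewriting done above; the relative path below is an
# assumption about how the links appear in the page's HTML.
'''
href = '../../downloads/pdf/crime_statistics/cs001pct.pdf'
print href.replace('../..', 'http://www.nyc.gov/html/nypd')
# -> http://www.nyc.gov/html/nypd/downloads/pdf/crime_statistics/cs001pct.pdf
'''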
# Put it all together: scrape every PDF linked from the crime statistics page
url = 'http://www.nyc.gov/html/nypd/html/crime_prevention/crime_statistics.shtml'
scrape_and_look_for_next_link(url)
'''
#Test one PDF - convert and scrape the pdf
#a = scraperwiki.scrape("http://www.nyc.gov/html/nypd/downloads/pdf/crime_statistics/cscity.pdf") #the citywide numbers
a = scraperwiki.scrape("http://www.nyc.gov/html/nypd/downloads/pdf/crime_statistics/cs001pct.pdf") # precinct one
soup_pdf = BeautifulSoup(scraperwiki.pdftoxml(a))
scrape_table(soup_pdf)
'''