Fixed bug #1 #2

Merged 6 commits on Oct 10, 2015
.gitignore (8 additions, 0 deletions)

@@ -1,2 +1,10 @@
 # Ignore output of scraper
 data.sqlite
+scraperwiki.sqlite
+scraperwiki.sqlite-journal
+
+# Ignore virtual environments
+envs
+
+# Ignore IDE stuff
+.idea
README.md (11 additions, 1 deletion)

@@ -1 +1,11 @@
-This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
+# Slovak parliament session transcripts database
+
+Scraper created at the OpenScraper Challenge 2014, improved in 2015
+
+## Dependencies
+
+The scraper has a few dependencies, which are listed in the requirements.txt file
+
+## Scraper
+
+This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
requirements.txt (3 additions, 0 deletions)

@@ -0,0 +1,3 @@
+requests==2.8.0
+scraperwiki==0.5.1
+beautifulsoup4==4.4.1
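With the new envs entry in .gitignore keeping virtual environments out of the repository, the pinned dependencies can be installed locally with `pip install -r requirements.txt`.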
scraper.py (27 additions, 13 deletions)

@@ -10,10 +10,10 @@
 _searchText=
 _sectionLayoutContainer$ctl00$_calendarApp=nrdi
 _sectionLayoutContainer$ctl00$_calendarLang=
-_sectionLayoutContainer$ctl00$_calendarMonth=7
-_sectionLayoutContainer$ctl00$_calendarYear=2014
-_sectionLayoutContainer$ctl00$_monthSelector=7
-_sectionLayoutContainer$ctl00$_yearSelector=2014
+_sectionLayoutContainer$ctl00$_calendarMonth=10
+_sectionLayoutContainer$ctl00$_calendarYear=2015
+_sectionLayoutContainer$ctl00$_monthSelector=10
+_sectionLayoutContainer$ctl00$_yearSelector=2015
 _sectionLayoutContainer$ctl01$_dateFrom=
 _sectionLayoutContainer$ctl01$_dateFrom$dateInput=
 _sectionLayoutContainer$ctl01$_dateTo=
@@ -30,7 +30,7 @@
 _sectionLayoutContainer_ctl01__dateFrom_dateInput_ClientState={"enabled":true,"emptyMessage":"","minDateStr":"1/1/1900 0:0:0","maxDateStr":"12/31/2099 0:0:0"}
 _sectionLayoutContainer_ctl01__dateFrom_dateInput_text=
 _sectionLayoutContainer_ctl01__dateTo_ClientState={"minDateStr":"1/1/1900 0:0:0","maxDateStr":"12/31/2099 0:0:0"}
-_sectionLayoutContainer_ctl01__dateTo_calendar_AD=[[1900,1,1],[2099,12,30],[2014,7,13]]
+_sectionLayoutContainer_ctl01__dateTo_calendar_AD=[[1900,1,1],[2099,12,30],[2015,10,10]]
 _sectionLayoutContainer_ctl01__dateTo_calendar_SD=[]
 _sectionLayoutContainer_ctl01__dateTo_dateInput_ClientState={"enabled":true,"emptyMessage":"","minDateStr":"1/1/1900 0:0:0","maxDateStr":"12/31/2099 0:0:0"}
 _sectionLayoutContainer_ctl01__dateTo_dateInput_text=
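The month/year fields above are part of the ASP.NET form payload that the scraper posts back to the parliament site. Below is a minimal sketch of how such name=value lines become a requests payload; the URL is a placeholder and PARAMS is trimmed to the fields in this hunk, so only the field names come from the diff:

```python
import requests

# Placeholder endpoint; the real URL is defined elsewhere in scraper.py.
URL = 'https://example.org/session-search'

# A trimmed copy of the "name=value" parameter lines from the hunk above.
PARAMS = """\
_sectionLayoutContainer$ctl00$_calendarMonth=10
_sectionLayoutContainer$ctl00$_calendarYear=2015
_sectionLayoutContainer$ctl00$_monthSelector=10
_sectionLayoutContainer$ctl00$_yearSelector=2015"""


def build_payload(raw):
    """Turn "name=value" lines into a dict for requests.post(data=...).

    partition() splits on the first '=' only, so a value that itself
    contains '=' survives intact.
    """
    payload = {}
    for line in raw.splitlines():
        name, _, value = line.partition('=')
        payload[name] = value
    return payload


session = requests.Session()
response = session.post(URL, data=build_payload(PARAMS))
```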
@@ -87,8 +87,8 @@ def parse_html(html, term_nr):

     # Get table rows. Class is used to highlight alternating rows, so
     # get both of them.
-    rows = html.body.find_all('tr', attrs={'class': 'tab_zoznam_nalt'}) + \
-           html.body.find_all('tr', attrs={'class': 'tab_zoznam_nonalt'})
+    rows = html.find_all('tr', attrs={'class': 'tab_zoznam_nalt'}) + \
+           html.find_all('tr', attrs={'class': 'tab_zoznam_nonalt'})

     data_rows = []
     for row in rows:
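A side note on the changed selector: BeautifulSoup's class_ filter also accepts a list, so both row classes can be matched in one call. Unlike the concatenation above, this sketch yields the rows in document order rather than grouped by class:

```python
# Single-pass alternative: match either alternating-row class at once.
rows = html.find_all('tr', class_=['tab_zoznam_nalt', 'tab_zoznam_nonalt'])
```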
@@ -115,9 +115,23 @@ def parse_html(html, term_nr):

         # HTTP links
         links = cols[4].find_all('a')
-        data_row['speech_video'] = links[0].attrs['href']
-        data_row['proceedings_video'] = links[1].attrs['href']
-        data_row['transcript'] = links[2].attrs['href']
+        try:
+            data_row['speech_video'] = links[0].attrs['href']
+        except IndexError:
+            print(u'Meeting no. %s (%s - %s) speech video link for %s has not been found!' %
+                  (data_row['meeting_number'], data_row['time_from'], data_row['time_to'], data_row['member']))
+
+        try:
+            data_row['proceedings_video'] = links[1].attrs['href']
+        except IndexError:
+            print(u'Meeting no. %s (%s - %s) proceedings video link for %s has not been found!' %
+                  (data_row['meeting_number'], data_row['time_from'], data_row['time_to'], data_row['member']))
+
+        try:
+            data_row['transcript'] = links[2].attrs['href']
+        except IndexError:
+            print(u'Meeting no. %s (%s - %s) transcript link for %s has not been found!' %
+                  (data_row['meeting_number'], data_row['time_from'], data_row['time_to'], data_row['member']))

         data_rows.append(data_row)

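The three new try/except blocks differ only in the link index and the label in the warning, so they could be collapsed into a helper. A hypothetical refactoring sketch follows; note it stores None for a missing link, whereas the code above leaves the key unset:

```python
def link_href(links, index, label, data_row):
    """Return the href of links[index], or None with a warning if absent."""
    if index < len(links):
        return links[index].attrs['href']
    print(u'Meeting no. %s (%s - %s) %s link for %s has not been found!' %
          (data_row['meeting_number'], data_row['time_from'],
           data_row['time_to'], label, data_row['member']))
    return None


data_row['speech_video'] = link_href(links, 0, 'speech video', data_row)
data_row['proceedings_video'] = link_href(links, 1, 'proceedings video', data_row)
data_row['transcript'] = link_href(links, 2, 'transcript', data_row)
```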
@@ -142,7 +156,7 @@ def main():
     response = session.get(url)
     if not response.ok:
         raise Exception("Failed to fetch %s" % url)
-    html = BeautifulSoup(response.text)
+    html = BeautifulSoup(response.text, "html.parser")
     # Electoral term numbers.
     term_numbers = get_term_numbers(html)

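Passing "html.parser" here (and in the two hunks below) pins the parser. Since BeautifulSoup 4.4, omitting it makes bs4 pick the best parser installed and emit a warning, so the same code could parse differently on Morph than locally. A small illustration:

```python
from bs4 import BeautifulSoup

markup = "<table><tr class='tab_zoznam_nalt'><td>x</td></tr></table>"

# Deterministic: always the stdlib parser, and no "no parser specified" warning.
fixed = BeautifulSoup(markup, "html.parser")

# Parser-dependent: html5lib, for example, wraps the <tr> in a <tbody>,
# while html.parser leaves the markup as written.
loose = BeautifulSoup(markup)
```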
@@ -157,7 +171,7 @@ def main():
         response = session.post(url, data=params)
         if not response.ok:
             raise Exception("Failed to fetch %s" % url)
-        html = BeautifulSoup(response.text)
+        html = BeautifulSoup(response.text, "html.parser")
         save_results(parse_html(html, term_nr), 1)

         for page_nr in itertools.count(2):
@@ -178,7 +192,7 @@
                 raise Exception("Failed to fetch %s" % url)

             # Parse and save data.
-            html = BeautifulSoup(response.text)
+            html = BeautifulSoup(response.text, "html.parser")
             data = parse_html(html, term_nr)
             save_results(data, page_nr)

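Taken together, main() fetches page 1 of each electoral term, then walks the remaining pages with itertools.count(2). The loop's shape, reusing parse_html and save_results from scraper.py, is roughly the sketch below; page_params() and the empty-page stop condition are hypothetical stand-ins, since the real postback fields and termination logic are outside this diff:

```python
import itertools

from bs4 import BeautifulSoup


def scrape_remaining_pages(session, url, term_nr):
    """Fetch result pages 2, 3, ... until a page yields no rows."""
    for page_nr in itertools.count(2):
        # page_params() is a hypothetical builder for the postback fields.
        response = session.post(url, data=page_params(term_nr, page_nr))
        if not response.ok:
            raise Exception("Failed to fetch %s" % url)
        html = BeautifulSoup(response.text, "html.parser")
        data = parse_html(html, term_nr)
        if not data:  # hypothetical stop: an empty page means we are done
            break
        save_results(data, page_nr)
```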
Expand Down