#retriever
"""Retriever script for National Phenology Network data
The dataset contains observation data retrieved from start date to current date
date format YYYY-MM-DD
Data having a value -9999 or "-9999" are considered 'null' or 'empty
Data from the API is xml having both taxa(plantae and animalia)
"""
from future import standard_library
standard_library.install_aliases()
from builtins import str
from retriever.lib.templates import Script
from retriever.lib.models import Table
from pkg_resources import parse_version
import xml.etree.ElementTree as ET
import datetime

# Import locations moved between retriever releases; fall back accordingly.
try:
    from retriever.lib.defaults import VERSION, DATA_WRITE_PATH
    try:
        from retriever.lib.tools import open_fw, open_csvw
    except ImportError:
        from retriever.lib.scripts import open_fw, open_csvw
except ImportError:
    from retriever import open_fw, open_csvw, DATA_WRITE_PATH, VERSION

class main(Script):
    def __init__(self, **kwargs):
        Script.__init__(self, **kwargs)
        self.title = "USA National Phenology Network"
        self.name = "npn"
        self.retriever_minimum_version = '2.0.dev'
        self.version = '2.1.4'
        self.ref = "http://www.usanpn.org/results/data"
        self.keywords = ["Data Type > Phenology", "Spatial Scale > Continental"]
        self.description = "The dataset contains (1) observations collected via Nature's Notebook phenology observation program (2009-present) and (2) lilac and honeysuckle data (1955-present)"
        self.citation = "Schwartz, M. D., Ault, T. R., & J. L. Betancourt, 2012: Spring Onset Variations and Trends in the Continental USA: Past and Regional Assessment Using Temperature-Based Indices. International Journal of Climatology (published online, DOI: 10.1002/joc.3625)."

        if parse_version(VERSION) <= parse_version("2.0.0"):
            # Older retriever versions used different attribute names.
            self.shortname = self.name
            self.name = self.title
            self.tags = self.keywords
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        csv_files = []
        request_src = "http://www.data-retriever.org/"
        base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
        header_values = ["observation_id",
                         "update_datetime",
                         "site_id",
                         "latitude",
                         "longitude",
                         "elevation_in_meters",
                         "state",
                         "species_id",
                         "genus",
                         "species",
                         "common_name",
                         "kingdom",
                         "individual_id",
                         "phenophase_id",
                         "phenophase_description",
                         "observation_date",
                         "day_of_year",
                         "phenophase_status",
                         "intensity_category_id",
                         "intensity_value",
                         "abundance_value"
                         ]
        columns = [("record_id", ("pk-auto",)),
                   ("observation_id", ("int",)),  # subsequently referred to as a "status record"
                   ("update_datetime", ("char",)),
                   ("site_id", ("int",)),
                   ("latitude", ("double",)),
                   ("longitude", ("double",)),
                   ("elevation_in_meters", ("char",)),
                   ("state", ("char",)),
                   ("species_id", ("int",)),
                   ("genus", ("char",)),
                   ("species", ("char",)),
                   ("common_name", ("char",)),
                   ("kingdom", ("char",)),  # skip kingdom
                   ("individual_id", ("char",)),
                   ("phenophase_id", ("int",)),
                   ("phenophase_description", ("char",)),
                   ("observation_date", ("char",)),
                   ("day_of_year", ("char",)),
                   ("phenophase_status", ("char",)),
                   ("intensity_category_id", ("char",)),
                   ("intensity_value", ("char",)),
                   ("abundance_value", ("char",))
                   ]
        start_date = datetime.date(2009, 1, 1)
        end_date = datetime.date.today()
        while start_date < end_date:
            # Request the data in 90-day windows to keep responses small.
            to_date = start_date + datetime.timedelta(90)
            if to_date >= end_date:
                data_url = base_url.format(startYear=str(start_date),
                                           endYear_date=str(end_date),
                                           request_src=request_src)
            else:
                data_url = base_url.format(startYear=str(start_date),
                                           endYear_date=str(to_date),
                                           request_src=request_src)
            xml_file_name = '{}.xml'.format(start_date)
            engine.download_file(data_url, xml_file_name)

            # Create a csv file for each 90-day window
            csv_observation = '{}.csv'.format(start_date)
            csv_files.append(csv_observation)
            csv_buff = open_fw(engine.format_filename(csv_observation))
            csv_writer = open_csvw(csv_buff)
            csv_writer.writerow(header_values)

            # Parse the xml and write one csv row per observation element.
            # DATA_WRITE_PATH contains a {dataset} placeholder; str.strip()
            # would treat it as a character set, so remove the substring instead.
            fname = DATA_WRITE_PATH.replace('{dataset}', '') + 'NPN/' + xml_file_name
            with open(fname, 'r') as fp1:
                file_read = fp1.read()
            root = ET.fromstring(file_read)
            index_map = {val: i for i, val in enumerate(header_values)}
            for elements in root:
                # Order each element's attributes to match header_values
                diction = sorted(elements.attrib.items(),
                                 key=lambda pair: index_map[pair[0]])
                csv_writer.writerow([x[1] for x in diction])
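                # For example, attributes {"site_id": "5", "observation_id": "1"}
                # sort to [("observation_id", "1"), ("site_id", "5")], matching
                # the header order. This assumes every element carries only
                # attributes named in header_values.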
            csv_buff.close()
            start_date = to_date + datetime.timedelta(1)
        # Create the output table and load every csv window into it
        table = Table('observations', delimiter=',', pk='record_id', contains_pk=True)
        table.columns = columns
        engine.table = table
        engine.create_table()
        for data_file in csv_files:
            engine.insert_data_from_file(engine.find_file(data_file))
        return engine


SCRIPT = main()
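

# A standalone sketch of the 90-day windowing used in download() above.
# The helper below is illustrative only and not part of the retriever API.
def _date_windows(start, end, step_days=90):
    """Yield (window_start, window_end) date pairs covering [start, end]."""
    while start < end:
        to_date = start + datetime.timedelta(step_days)
        yield start, min(to_date, end)
        start = to_date + datetime.timedelta(1)

# Example:
#   list(_date_windows(datetime.date(2009, 1, 1), datetime.date(2009, 7, 1)))
#   -> [(2009-01-01, 2009-04-01), (2009-04-02, 2009-07-01)]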