-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocess-kenya-subjects.py
72 lines (55 loc) · 2.36 KB
/
process-kenya-subjects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Process Subjects (WildWatch Kenya)
# ----------------------------------
#
# Reads a Zooniverse Subject export CSV file and makes it ready to be added to
# the map database. Usually, this is used to extract information buried in the
# subjects.locations and subjects.metadata fields, and bring that info to the
# top-level.
#
# Usage:
# python3 process-kenya-subjects.py inputFilename.csv outputFilename.csv
#
# (@shaunanoordin 20210322)
import sys, csv, json, re
# Columns written to the output CSV, in this order. Columns that also exist in
# the input CSV are copied across as-is; the rest are derived per row.
OUTPUT_FIELDNAMES = [
    'subject_id', 'kenya_id', 'camera', 'location', 'subject_set_id',
    'workflow_id', 'project_id', 'camera_name', 'year', 'month',
]

# Patterns used to pick apart a subject's file name. They are compiled once
# (not once per row) and written as raw strings so that '\d' and '\.' reach
# the regex engine without triggering invalid-escape warnings.
_FRAME_SUFFIX_RE = re.compile(r'_(\d)+\.JPG$')
_CAMERA_NAME_RE = re.compile(r'(^|_|-)([A-Z]+[0-9]+)(_|-)')
_YEAR_RE = re.compile(r'(^|_|-)(20\d\d)(_|-)')
_MONTH_RE = re.compile(
    r'(^|_|-)(jan|january|feb|february|mar|march|apr|april|may|jun|june|jul'
    r'|july|aug|august|sep|sept|september|oct|october|nov|november|dec'
    r'|december)(_|-)',
    re.IGNORECASE,
)


def _extract_name_fields(name):
    """Return (camera_name, year, month) parsed from a subject file name.

    A trailing '_<digits>.JPG' suffix is removed first. Each field is then
    searched for as a token that starts at the beginning of the name or after
    '_' / '-', and is followed by '_' / '-'. A field that cannot be found is
    returned as an empty string. The month is upper-cased and cut to its first
    three letters (e.g. 'March' -> 'MAR').
    """
    stem = _FRAME_SUFFIX_RE.sub('', name)
    camera_match = _CAMERA_NAME_RE.search(stem)
    year_match = _YEAR_RE.search(stem)
    month_match = _MONTH_RE.search(stem)
    return (
        camera_match.group(2) if camera_match else '',
        year_match.group(2) if year_match else '',
        month_match.group(2).upper()[:3] if month_match else '',
    )


def _build_output_row(in_row):
    """Build one output row (a dict) from one row of the input CSV.

    Raises KeyError if the row has no 'locations' / 'metadata' column, if the
    metadata JSON has no 'name' entry, or if the locations JSON has no '0'
    entry. Raises json.JSONDecodeError if either column is not valid JSON.
    """
    # Start from the input columns that are also output columns.
    out_row = {col: in_row[col] for col in OUTPUT_FIELDNAMES if col in in_row}

    # These two columns hold JSON documents.
    locations = json.loads(in_row['locations'])
    metadata = json.loads(in_row['metadata'])

    # Extract the project-specific information and bring it to the top level.
    camera_name, year, month = _extract_name_fields(metadata['name'])
    out_row['kenya_id'] = metadata['name']
    out_row['camera'] = camera_name + '_' + year + '_' + month
    out_row['location'] = locations['0']
    out_row['camera_name'] = camera_name
    out_row['year'] = year
    out_row['month'] = month
    return out_row


def main(argv):
    """Convert the CSV named by argv[1] into the CSV named by argv[2].

    argv has the same layout as sys.argv (program name first). Every output
    row is also printed to stdout. Returns the process exit status: 0 on
    success, 1 when too few arguments were given.
    """
    if len(argv) < 3:
        # Report on stderr and return a non-zero status so that calling
        # shells and pipelines can tell the run failed.
        print('ERROR: not enough arguments', file=sys.stderr)
        print(
            'Usage: python3 process-kenya-subjects.py '
            'inputFilename.csv outputFilename.csv',
            file=sys.stderr,
        )
        return 1

    input_filename = argv[1]
    output_filename = argv[2]

    # The encoding is given explicitly so the result does not depend on the
    # platform's locale; newline='' is what the csv module expects.
    with open(input_filename, mode='r', newline='', encoding='utf-8') as input_file, \
            open(output_filename, mode='w', newline='', encoding='utf-8') as output_file:
        # Read the input CSV
        input_reader = csv.DictReader(input_file)

        # Prepare the output CSV
        output_writer = csv.DictWriter(output_file, fieldnames=OUTPUT_FIELDNAMES)
        output_writer.writeheader()

        for in_row in input_reader:
            out_row = _build_output_row(in_row)
            output_writer.writerow(out_row)
            print(out_row)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))