# update_bookplates_one_time.py
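"""One-time script to update 966 e-bookplate fields in Alma bib records.

Reads a report of MMS IDs from Alma Analytics (or a small built-in test set),
matches each bib's 966 fields against a SPAC-to-bookplate CSV mapping, and
rewrites the bookplate text ($b) and URL ($c) subfields via the Alma API.
"""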
import csv
import argparse
import logging
from alma_api_keys import API_KEYS
from alma_api_client import AlmaAPIClient
from alma_analytics_client import AlmaAnalyticsClient
from alma_marc import get_pymarc_record_from_bib, prepare_bib_for_update
from pymarc import Field


def get_mms_report(analytics_api_key: str) -> list:
"""Get the report of MMS IDs and current 966 contents from Alma Analytics."""
# analytics only available in prod environment
aac = AlmaAnalyticsClient(analytics_api_key)
report_path = (
"/shared/University of California Los Angeles (UCLA) 01UCS_LAL"
"/Cataloging/Reports/API/MMS IDs for 966 updates"
)
aac.set_report_path(report_path)
report = aac.get_report()
    return report


def get_spac_mappings(input_file: str) -> list:
"""Get SPAC mappings from a CSV file. Filter out any lines without a valid URL."""
spac_mappings = []
with open(input_file, newline="", encoding="utf-8-sig") as csv_file:
reader = csv.DictReader(csv_file)
for line in reader:
# remove leading/trailing whitespace from all values
line = {k: v.strip() for k, v in line.items()}
# first, check if the line has a valid URL. If not, skip it.
if not line["URL"]:
continue
elif line["URL"][:4] != "http":
continue
else:
current_line = {
"SPAC": line["SPAC"],
"NAME": line["NAME"],
"URL": line["URL"],
}
spac_mappings.append(current_line)
    return spac_mappings


def needs_bookplate_update(old_field: Field, spac_mappings: list) -> bool:
    """Check if a 966 field matches a SPAC code. If so, assume it needs updating."""
    subfields_a = old_field.get_subfields("a")
    # a 966 field without a $a subfield has no SPAC code to match against
    if not subfields_a:
        return False
    # check if the SPAC code in the 966 field is in the list of SPAC mappings
    for spac_mapping in spac_mappings:
        if spac_mapping["SPAC"] == subfields_a[0]:
            return True
    return False


def get_spac_info(spac_mappings: list, field_966: Field) -> dict:
    """Get the SPAC name and URL from the mappings to update a 966."""
    subfields_a = field_966.get_subfields("a")
    if subfields_a:
        for spac_mapping in spac_mappings:
            if spac_mapping["SPAC"] == subfields_a[0]:
                return {
                    "spac_name": spac_mapping["NAME"],
                    "spac_url": spac_mapping["URL"],
                }
    # no matching SPAC: return empty values
    return {"spac_name": "", "spac_url": ""}


def update_existing_966(field_966: Field, spac_name: str, spac_url: str) -> None:
"""Update the URL and bookplate text in an existing 966 field."""
# update $b for bookplate text
field_966.delete_subfield("b")
field_966.add_subfield("b", spac_name)
# update $c for URL
field_966.delete_subfield("c")
# if spac_url is an empty string, don't add $c back in
if spac_url:
        field_966.add_subfield("c", spac_url)


def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"spac_mappings_file", help="Path to the SPAC mappings .csv file"
)
parser.add_argument(
"environment",
help="Alma environment (sandbox or production), or 'test' for a small test set.",
)
parser.add_argument(
"--log-level",
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
default="INFO",
help="Set the logging level",
)
parser.add_argument(
"--start-index", type=int, help="Start processing report data at this index"
)
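    # example invocation (CSV path is illustrative):
    #   python update_bookplates_one_time.py spac_mappings.csv sandbox --log-level DEBUG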
args = parser.parse_args()
logging.basicConfig(filename="update_bookplates_one_time.log", level=args.log_level)
# always suppress urllib3 logs with lower level than WARNING
logging.getLogger("urllib3").setLevel(logging.WARNING)
if args.environment == "test":
# test data for sandbox environment
report = [
{"MMS Id": "9911656853606533"},
{"MMS Id": "9990572683606533"},
]
alma_api_key = API_KEYS["SANDBOX"]
elif args.environment == "sandbox":
# use production analytics key for sandbox environment, since sandbox doesn't have analytics
analytics_api_key = API_KEYS["DIIT_ANALYTICS"]
alma_api_key = API_KEYS["SANDBOX"]
report = get_mms_report(analytics_api_key)
    elif args.environment == "production":
        analytics_api_key = API_KEYS["DIIT_ANALYTICS"]
        alma_api_key = API_KEYS["DIIT_SCRIPTS"]
        report = get_mms_report(analytics_api_key)
    else:
        # fail fast on an unrecognized environment instead of raising NameError later
        parser.error(f"Invalid environment: {args.environment}")
# if a start index is provided, slice the report to start at that index
if args.start_index:
report = report[args.start_index :]
logging.info(f"Beginning processing {len(report)} bib e-bookplates")
client = AlmaAPIClient(alma_api_key)
spac_mappings = get_spac_mappings(args.spac_mappings_file)
# initialize counters
total_bibs_updated = 0
total_bibs_skipped = 0
total_bibs_errored = 0
report_index = 0
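    # report_index is logged on errors and at progress intervals so a halted
    # run can be resumed later via --start-index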
for item in report:
mms_id = item["MMS Id"]
bib_was_updated = False
# get bib from Alma
alma_bib = client.get_bib(mms_id).get("content")
        # if we get an empty response, halt the script
        # (check this before the bytes test below, which would fail on None);
        # report is sorted by MMS ID, so we can use this to resume later if needed
        if not alma_bib:
            logging.error(
                f"Unexpected response for MMS ID {mms_id}, index {report_index}. Exiting."
            )
            raise SystemExit(1)
        # check for error in bib response, usually due to invalid MMS ID
        if b"errorsExist" in alma_bib:
            logging.error(
                f"Got an error finding bib record for MMS ID {mms_id}. Skipping this record."
            )
            total_bibs_errored += 1
            continue
# convert to Pymarc to handle fields and subfields
pymarc_record = get_pymarc_record_from_bib(alma_bib)
for field_966 in pymarc_record.get_fields("966"):
if needs_bookplate_update(field_966, spac_mappings):
# get the SPAC name and URL from the mappings
spac_info = get_spac_info(spac_mappings, field_966)
spac_name = spac_info["spac_name"]
spac_url = spac_info["spac_url"]
update_existing_966(field_966, spac_name, spac_url)
logging.debug(
f"Updated bookplate. MMS ID: {mms_id}, SPAC Name: {spac_name}",
)
bib_was_updated = True
if bib_was_updated:
new_alma_bib = prepare_bib_for_update(alma_bib, pymarc_record)
client.update_bib(mms_id, new_alma_bib)
total_bibs_updated += 1
else:
# this case shouldn't happen, since report is limited to records that need updating
# log it in case it does
total_bibs_skipped += 1
logging.info(f"Skipping MMS ID {mms_id}. No 966 updates needed.")
report_index += 1
# every 1% of records, log progress
# Take 1%, round down, add 1 to avoid 0 when length < 100
progress_interval = (len(report) // 100) + 1
if report_index % progress_interval == 0:
logging.info(f"Processed {report_index} bibs. Last MMS ID: {mms_id}")
logging.info("Finished adding ebookplates.")
logging.info(f"{total_bibs_updated} bibs updated.")
logging.info(f"{total_bibs_skipped} bibs skipped with no 966 updates needed.")
logging.info(f"{total_bibs_errored} bibs skipped due to errors.")
if __name__ == "__main__":
main()