From 9219a44f8bfbe99f52da13104006205088acc3b0 Mon Sep 17 00:00:00 2001
From: mas400
Date: Wed, 28 Jun 2017 16:34:01 -0400
Subject: [PATCH] updated scripts to use command line arguments

---
 .../dats_to_doi/create_spew_mapping.py   |  59 ++--
 .../dats_to_doi/update_dats_with_doi.py  | 109 ++++----
 .../dats_to_doi/upload_dats_to_zenodo.py | 261 +++++++++---------
 3 files changed, 223 insertions(+), 206 deletions(-)

diff --git a/src/scripts/dats_to_doi/create_spew_mapping.py b/src/scripts/dats_to_doi/create_spew_mapping.py
index e840f596..106d70a3 100644
--- a/src/scripts/dats_to_doi/create_spew_mapping.py
+++ b/src/scripts/dats_to_doi/create_spew_mapping.py
@@ -4,40 +4,47 @@
 import json
 import csv
 import re
+import sys
 
-dats_folder = 'DATS FOLDER LOCATION'
-if os.path.isfile('spew_mapping.csv'):
-    file = open('spew_mapping.csv', 'a+')
-    interval = sum(1 for line in open('spew_mapping.csv'))-1
-    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
+if len(sys.argv) < 2:
+    print('DESCRIPTION:\n\tReads SPEW metadata and creates a CSV mapping of the location code, SPEW version, and landing page URL to an anonymous identifier\n\tA directory path is required\n')
+    print('USAGE:\n\tpython create_spew_mapping.py <directory path>\n')
 else:
-    interval = 0
-    file = open('spew_mapping.csv', 'w')
-    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
-    wr.writerow(['Apollo Location Code', 'SPEW Version', 'Landing Page', 'Anonymous Identifier', 'Title'])
+    dats_folder = sys.argv[1]
 
-for filename in os.listdir(dats_folder):
-    if filename.endswith(".json"):
-        interval += 1
+    if os.path.isfile('spew_mapping.csv'):
+        file = open('spew_mapping.csv', 'a+')
+        interval = sum(1 for line in open('spew_mapping.csv'))-1
+        wr = csv.writer(file, quoting=csv.QUOTE_ALL)
 
-        # Read metadata as json
-        with open(os.path.join(dats_folder, filename)) as json_file:
-            json_data = json.load(json_file);
+    else:
+        interval = 0
+        file = open('spew_mapping.csv', 'w')
+        wr = csv.writer(file, quoting=csv.QUOTE_ALL)
+        wr.writerow(['Apollo Location 
Code', 'SPEW Version', 'Landing Page', 'Anonymous Identifier', 'Title']) - # Get title - title = json_data['title'] + for filename in os.listdir(dats_folder): + if filename.endswith(".json"): + interval += 1 - # Get landing page - landing_page = json_data['distributions'][0]['access']['landingPage'] + # Read metadata as json + with open(os.path.join(dats_folder, filename)) as json_file: + json_data = json.load(json_file); - # Get apollo location code - ls_url = json_data['spatialCoverage'][0]['identifier']['identifier'] - location_code = int(re.search(r'\d+', ls_url).group()) + # Get title + title = json_data['title'] - # Get spew version - version = json_data['types'][2]['platform']['value'] + # Get landing page + landing_page = json_data['distributions'][0]['access']['landingPage'] - wr.writerow([location_code, version, landing_page, str(interval).zfill(7), title]) + # Get apollo location code + ls_url = json_data['spatialCoverage'][0]['identifier']['identifier'] + location_code = int(re.search(r'\d+', ls_url).group()) -file.close() \ No newline at end of file + # Get spew version + version = json_data['types'][2]['platform']['value'] + + wr.writerow([location_code, version, landing_page, str(interval).zfill(7), title]) + + file.close() \ No newline at end of file diff --git a/src/scripts/dats_to_doi/update_dats_with_doi.py b/src/scripts/dats_to_doi/update_dats_with_doi.py index 77f43fa0..e0ea6896 100644 --- a/src/scripts/dats_to_doi/update_dats_with_doi.py +++ b/src/scripts/dats_to_doi/update_dats_with_doi.py @@ -2,68 +2,73 @@ import csv import json import collections +import sys -ACCESS_TOKEN = 'SvxcV0O7kHohjkBVHcHZ3iZmgtJvKeZPN85ZFtgrc5wa0Uup1MtYWl2HzWTw' -dats_folder = '/Users/amd176/Documents/Repositories/digital-commons/src/scripts/convert_to_dats/output/spew_ipums_dats_json/' +if len(sys.argv) < 3: + print('DESCRIPTION:\n\tUpdate the DATS metadata with the DOI and PURL\n\tA Zenodo access token and directory path are required\n') + print('USAGE:\n\tpython 
update_dats_with_doi.py <access token> <directory path>\n')
+else:
+    ACCESS_TOKEN = sys.argv[1]
+    dats_folder = sys.argv[2]
 
-data = csv.reader(open('spew_mapping.csv'))
-# Read the column names from the first line of the file
-fields = next(data)
-csv_dict = {}
-for row in data:
-    # Zip together the field names and values
-    items = zip(fields, row)
-    item = {}
-    key = ()
-    # Add the value to our dictionary
-    for (name, value) in items:
-        item[name] = value.strip()
+    data = csv.reader(open('spew_mapping.csv'))
+    # Read the column names from the first line of the file
+    fields = next(data)
+    csv_dict = {}
+    for row in data:
+        # Zip together the field names and values
+        items = zip(fields, row)
+        item = {}
+        key = ()
+        # Add the value to our dictionary
+        for (name, value) in items:
+            item[name] = value.strip()
 
-    key = item['Title']
-    csv_dict[key] = item
+        key = item['Title']
+        csv_dict[key] = item
 
-response = requests.get('https://zenodo.org//api/deposit/depositions', params={'access_token': ACCESS_TOKEN, 'size': 200, 'status': 'published'})
-json_response = response.json()
-for deposition_index in range(len(json_response)):
-    id = json_response[deposition_index]['id']
+    response = requests.get('https://zenodo.org//api/deposit/depositions', params={'access_token': ACCESS_TOKEN, 'size': 200, 'status': 'published'})
+    json_response = response.json()
+    for deposition_index in range(len(json_response)):
+        id = json_response[deposition_index]['id']
 
-    r = requests.get("https://zenodo.org/api/deposit/depositions/" + str(id),
-                     params={'access_token': ACCESS_TOKEN})
+        r = requests.get("https://zenodo.org/api/deposit/depositions/" + str(id),
+                         params={'access_token': ACCESS_TOKEN})
 
-    deposition_json = r.json()
+        deposition_json = r.json()
 
-    # Get download link for access url
-    access_url = "https://zenodo.org/record/" + str(deposition_json['record_id']) + "/files/" + deposition_json['files'][0]['filename']
+        # Get download link for access url
+        access_url = "https://zenodo.org/record/" + 
str(deposition_json['record_id']) + "/files/" + deposition_json['files'][0]['filename'] - # Get title to cross reference with spew_mapping.csv - title = deposition_json['title'] - if not "RABIES" in title.upper() and not "H1N1" in title: - try: - landing_url = "http://w3id.org/spew/" + csv_dict[title]['Anonymous Identifier'] - except KeyError: - continue + # Get title to cross reference with spew_mapping.csv + title = deposition_json['title'] + if not "RABIES" in title.upper() and not "H1N1" in title: + try: + landing_url = "http://w3id.org/spew/" + csv_dict[title]['Anonymous Identifier'] + except KeyError: + continue - # Extract the name from the landing page in spew_mapping, this will allow us to access the json file - file_name = () - old_landing_page = csv_dict[title]['Landing Page'].split('/') - if len(old_landing_page) > 10: - file_name = old_landing_page[8] + ".json" - else: - file_name = old_landing_page[7] + ".json" + # Extract the name from the landing page in spew_mapping, this will allow us to access the json file + file_name = () + old_landing_page = csv_dict[title]['Landing Page'].split('/') + if len(old_landing_page) > 10: + file_name = old_landing_page[8] + ".json" + else: + file_name = old_landing_page[7] + ".json" - # Update the dats file with the correct identifier information and the access and landing URLs - try: - with open(dats_folder+file_name) as json_file: - old_meta_data = json.load(json_file, object_pairs_hook=collections.OrderedDict) - except FileNotFoundError: - continue + # Update the dats file with the correct identifier information and the access and landing URLs + try: + with open(dats_folder+file_name) as json_file: + old_meta_data = json.load(json_file, object_pairs_hook=collections.OrderedDict) + except FileNotFoundError: + continue - old_meta_data['identifier']['identifier'] = deposition_json['doi_url'] - old_meta_data['identifier']['identifierSource'] = "zenodo" - old_meta_data['distributions'][0]['access']['accessURL'] = 
access_url - old_meta_data['distributions'][0]['access']['landingPage'] = landing_url + old_meta_data['identifier']['identifier'] = deposition_json['doi_url'] + old_meta_data['identifier']['identifierSource'] = "zenodo" + old_meta_data['distributions'][0]['access']['accessURL'] = access_url + old_meta_data['distributions'][0]['access']['landingPage'] = landing_url - with open(dats_folder+file_name, 'w') as outfile: - json.dump(old_meta_data, outfile, indent=4) - print("created " + file_name) \ No newline at end of file + with open(dats_folder+file_name, 'w') as outfile: + json.dump(old_meta_data, outfile, indent=4) + print("created " + file_name) \ No newline at end of file diff --git a/src/scripts/dats_to_doi/upload_dats_to_zenodo.py b/src/scripts/dats_to_doi/upload_dats_to_zenodo.py index 707819ea..8e9bf0c7 100644 --- a/src/scripts/dats_to_doi/upload_dats_to_zenodo.py +++ b/src/scripts/dats_to_doi/upload_dats_to_zenodo.py @@ -9,139 +9,144 @@ import os from os import system from pathlib import Path - -ACCESS_TOKEN = 'SvxcV0O7kHohjkBVHcHZ3iZmgtJvKeZPN85ZFtgrc5wa0Uup1MtYWl2HzWTw' -dats_folder = '/Users/amd176/Documents/Repositories/digital-commons/src/scripts/convert_to_dats/output/spew_us_dats_json' - -headers = {"Content-Type": "application/json"} - -# iterate over every file in this directory -for filename in os.listdir(dats_folder): - if filename.endswith(".json"): - # Read metadata as json - with open(os.path.join(dats_folder, filename)) as json_file: - json_data = json.load(json_file); - - # Get url from json metadata - url_identifier = (json_data['identifier']['identifier']) - - # If apollo library entry - if url_identifier: - # Parse url to xml to string to file - xml = XML.parse(URLLIB.urlopen(url_identifier)) - xml_string = xml.toprettyxml() - xml_filename = 'data.xml' - xml_file = open(xml_filename, "w") - xml_file.writelines(xml_string) - xml_file.close() - - # Parse URL to json to string to file - with URLLIB.urlopen(url_identifier.replace("xml", 
"json")) as url: - data_as_json = json.loads(url.read().decode()) - json_filename = 'data.json' - json_file = open(json_filename, "w") - json_file.writelines(json.dumps(data_as_json)) - json_file.close() - - # If spew entry - else: - access_url = json_data['distributions'][0]['access']['accessURL'] - identifier = access_url.split('/')[-2] - - # Check if spew data already exists, otherwise download it - spew_output_file = Path(dats_folder + identifier + ".tar.gz") - if not spew_output_file.is_file(): - hrefs = access_url.split('edu')[1] - post_request = access_url.split(identifier)[0] - system( - 'curl -X POST -F "action=download" -F "as=' + identifier + '.tar" -F "type=php-tar" -F "hrefs=' + hrefs + '" ' + post_request + ' | gzip -vc > ' + dats_folder + identifier + '.tar.gz') - - # Create empty upload - r = requests.post('https://zenodo.org/api/deposit/depositions', - params={'access_token': ACCESS_TOKEN}, json={}, - headers=headers) - - # get deposition id from previous response - deposition_id = r.json()['id'] - - # add metadata - creators = [] - for name_index in range(len(json_data['creators'])): - creators.append({'name': json_data['creators'][name_index]['lastName'] + ', ' + json_data['creators'][name_index]['firstName']},) - - # funders = [] - # for grant_index in range(len(json_data['acknowledges'])): - # funders.append({'id' : json_data['acknowledges'][grant_index]['identifier']['identifier']},) - - data = { - "metadata": { - "title": json_data['title'], - "upload_type": "dataset", - "creators": - creators, - "description": json_data['description'], - "access_right": "open", - # "grants": - # funders, - - # license for Creative Commons Attribution 4.0 - "license": { - "domain_content": "true", - "domain_data": "true", - "domain_software": "false", - "family": "", - "id": "CC-BY-4.0", - "od_conformance": "approved", - "osd_conformance": "not reviewed", - "maintainer": "Creative Commons", - "status": "active", - "title": "Creative Commons Attribution 4.0", 
- "url": "https://creativecommons.org/licenses/by/4.0/" +import sys + +if len(sys.argv) < 3: + print('DESCRIPTION:\n\tGenerate Zenodo drafts for DATS datasets\n\tA Zenodo access token and directory path are required\n') + print('USAGE:\n\tpython upload_dats_to_zenodo.py \n') +else: + ACCESS_TOKEN = sys.argv[1] + dats_folder = sys.argv[2] + + headers = {"Content-Type": "application/json"} + + # iterate over every file in this directory + for filename in os.listdir(dats_folder): + if filename.endswith(".json"): + # Read metadata as json + with open(os.path.join(dats_folder, filename)) as json_file: + json_data = json.load(json_file); + + # Get url from json metadata + url_identifier = (json_data['identifier']['identifier']) + + # If apollo library entry + if url_identifier: + # Parse url to xml to string to file + xml = XML.parse(URLLIB.urlopen(url_identifier)) + xml_string = xml.toprettyxml() + xml_filename = 'data.xml' + xml_file = open(xml_filename, "w") + xml_file.writelines(xml_string) + xml_file.close() + + # Parse URL to json to string to file + with URLLIB.urlopen(url_identifier.replace("xml", "json")) as url: + data_as_json = json.loads(url.read().decode()) + json_filename = 'data.json' + json_file = open(json_filename, "w") + json_file.writelines(json.dumps(data_as_json)) + json_file.close() + + # If spew entry + else: + access_url = json_data['distributions'][0]['access']['accessURL'] + identifier = access_url.split('/')[-2] + + # Check if spew data already exists, otherwise download it + spew_output_file = Path(dats_folder + identifier + ".tar.gz") + if not spew_output_file.is_file(): + hrefs = access_url.split('edu')[1] + post_request = access_url.split(identifier)[0] + system( + 'curl -X POST -F "action=download" -F "as=' + identifier + '.tar" -F "type=php-tar" -F "hrefs=' + hrefs + '" ' + post_request + ' | gzip -vc > ' + dats_folder + identifier + '.tar.gz') + + # Create empty upload + r = requests.post('https://zenodo.org/api/deposit/depositions', + 
params={'access_token': ACCESS_TOKEN}, json={}, + headers=headers) + + # get deposition id from previous response + deposition_id = r.json()['id'] + + # add metadata + creators = [] + for name_index in range(len(json_data['creators'])): + creators.append({'name': json_data['creators'][name_index]['lastName'] + ', ' + json_data['creators'][name_index]['firstName']},) + + # funders = [] + # for grant_index in range(len(json_data['acknowledges'])): + # funders.append({'id' : json_data['acknowledges'][grant_index]['identifier']['identifier']},) + + data = { + "metadata": { + "title": json_data['title'], + "upload_type": "dataset", + "creators": + creators, + "description": json_data['description'], + "access_right": "open", + # "grants": + # funders, + + # license for Creative Commons Attribution 4.0 + "license": { + "domain_content": "true", + "domain_data": "true", + "domain_software": "false", + "family": "", + "id": "CC-BY-4.0", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "maintainer": "Creative Commons", + "status": "active", + "title": "Creative Commons Attribution 4.0", + "url": "https://creativecommons.org/licenses/by/4.0/" + } } } - } - r = requests.put('https://zenodo.org/api/deposit/depositions/%s' % deposition_id, - params={'access_token': ACCESS_TOKEN}, data=json.dumps(data), - headers=headers) - - #upload files for library viewer - if url_identifier: - # upload new file (xml) - data = {'filename': json_data['title']+'.xml'} - files = {'file': open(xml_filename, "rb")} - r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id, - params={'access_token': ACCESS_TOKEN}, data=data, - files=files) - - # upload new file (json) - data = {'filename': json_data['title']+'.json'} - files = {'file' : open(json_filename,"rb")} - r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id, - params={'access_token': ACCESS_TOKEN}, data=data, - files=files) - - print(r.status_code) - 
print(r.json()) - - # delete temp files - os.remove(xml_filename) - os.remove(json_filename) + r = requests.put('https://zenodo.org/api/deposit/depositions/%s' % deposition_id, + params={'access_token': ACCESS_TOKEN}, data=json.dumps(data), + headers=headers) + + #upload files for library viewer + if url_identifier: + # upload new file (xml) + data = {'filename': json_data['title']+'.xml'} + files = {'file': open(xml_filename, "rb")} + r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id, + params={'access_token': ACCESS_TOKEN}, data=data, + files=files) - else: - data = {'filename' : 'output.tar.gz'} - files = {'file' : open(dats_folder + identifier + ".tar.gz", "rb")} - file_size = os.path.getsize(dats_folder + identifier + ".tar.gz") - if file_size < 1e+8: + # upload new file (json) + data = {'filename': json_data['title']+'.json'} + files = {'file' : open(json_filename,"rb")} r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id, - params={'access_token': ACCESS_TOKEN}, data=data, - files=files) + params={'access_token': ACCESS_TOKEN}, data=data, + files=files) - print(spew_output_file) + print(r.status_code) + print(r.json()) - print(r.status_code) - print(r.json()) + # delete temp files + os.remove(xml_filename) + os.remove(json_filename) + else: + data = {'filename' : 'output.tar.gz'} + files = {'file' : open(dats_folder + identifier + ".tar.gz", "rb")} + file_size = os.path.getsize(dats_folder + identifier + ".tar.gz") + if file_size < 1e+8: + r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id, + params={'access_token': ACCESS_TOKEN}, data=data, + files=files) - continue - else: - continue + print(spew_output_file) + + print(r.status_code) + print(r.json()) + + + continue + else: + continue