From 9219a44f8bfbe99f52da13104006205088acc3b0 Mon Sep 17 00:00:00 2001
From: mas400
Date: Wed, 28 Jun 2017 16:34:01 -0400
Subject: [PATCH] updated scripts to use command line arguments

---
 .../dats_to_doi/create_spew_mapping.py   |  59 ++--
 .../dats_to_doi/update_dats_with_doi.py  | 109 ++++----
 .../dats_to_doi/upload_dats_to_zenodo.py | 261 +++++++++---------
 3 files changed, 223 insertions(+), 206 deletions(-)

diff --git a/src/scripts/dats_to_doi/create_spew_mapping.py b/src/scripts/dats_to_doi/create_spew_mapping.py
index e840f596..106d70a3 100644
--- a/src/scripts/dats_to_doi/create_spew_mapping.py
+++ b/src/scripts/dats_to_doi/create_spew_mapping.py
@@ -4,40 +4,47 @@
 import json
 import csv
 import re
+import sys
 
-dats_folder = 'DATS FOLDER LOCATION'
-if os.path.isfile('spew_mapping.csv'):
-    file = open('spew_mapping.csv', 'a+')
-    interval = sum(1 for line in open('spew_mapping.csv'))-1
-    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
+if len(sys.argv) < 2:
+    print('DESCRIPTION:\n\tReads SPEW metadata and creates a CSV mapping of the location code, SPEW version, and landing page URL to an anonymous identifier\n\tA directory path is required\n')
+    print('USAGE:\n\tpython create_spew_mapping.py <directory path>\n')
 else:
-    interval = 0
-    file = open('spew_mapping.csv', 'w')
-    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
-    wr.writerow(['Apollo Location Code', 'SPEW Version', 'Landing Page', 'Anonymous Identifier', 'Title'])
+    dats_folder = sys.argv[1]
 
-for filename in os.listdir(dats_folder):
-    if filename.endswith(".json"):
-        interval += 1
+    if os.path.isfile('spew_mapping.csv'):
+        file = open('spew_mapping.csv', 'a+')
+        interval = sum(1 for line in open('spew_mapping.csv'))-1
+        wr = csv.writer(file, quoting=csv.QUOTE_ALL)
 
-        # Read metadata as json
-        with open(os.path.join(dats_folder, filename)) as json_file:
-            json_data = json.load(json_file);
+    else:
+        interval = 0
+        file = open('spew_mapping.csv', 'w')
+        wr = csv.writer(file, quoting=csv.QUOTE_ALL)
+        wr.writerow(['Apollo Location 
Code', 'SPEW Version', 'Landing Page', 'Anonymous Identifier', 'Title']) - # Get title - title = json_data['title'] + for filename in os.listdir(dats_folder): + if filename.endswith(".json"): + interval += 1 - # Get landing page - landing_page = json_data['distributions'][0]['access']['landingPage'] + # Read metadata as json + with open(os.path.join(dats_folder, filename)) as json_file: + json_data = json.load(json_file); - # Get apollo location code - ls_url = json_data['spatialCoverage'][0]['identifier']['identifier'] - location_code = int(re.search(r'\d+', ls_url).group()) + # Get title + title = json_data['title'] - # Get spew version - version = json_data['types'][2]['platform']['value'] + # Get landing page + landing_page = json_data['distributions'][0]['access']['landingPage'] - wr.writerow([location_code, version, landing_page, str(interval).zfill(7), title]) + # Get apollo location code + ls_url = json_data['spatialCoverage'][0]['identifier']['identifier'] + location_code = int(re.search(r'\d+', ls_url).group()) -file.close() \ No newline at end of file + # Get spew version + version = json_data['types'][2]['platform']['value'] + + wr.writerow([location_code, version, landing_page, str(interval).zfill(7), title]) + + file.close() \ No newline at end of file diff --git a/src/scripts/dats_to_doi/update_dats_with_doi.py b/src/scripts/dats_to_doi/update_dats_with_doi.py index 77f43fa0..e0ea6896 100644 --- a/src/scripts/dats_to_doi/update_dats_with_doi.py +++ b/src/scripts/dats_to_doi/update_dats_with_doi.py @@ -2,68 +2,73 @@ import csv import json import collections +import sys -ACCESS_TOKEN = 'SvxcV0O7kHohjkBVHcHZ3iZmgtJvKeZPN85ZFtgrc5wa0Uup1MtYWl2HzWTw' -dats_folder = '/Users/amd176/Documents/Repositories/digital-commons/src/scripts/convert_to_dats/output/spew_ipums_dats_json/' +if len(sys.argv) < 3: + print('DESCRIPTION:\n\tUpdate the DATS metadata with the DOI and PURL\n\tA Zenodo access token and directory path are required\n') + print('USAGE:\n\tpython 
update_dats_with_doi.py <access token> <directory path>\n')
+else:
+    ACCESS_TOKEN = sys.argv[1]
+    dats_folder = sys.argv[2]
 
-data = csv.reader(open('spew_mapping.csv'))
-# Read the column names from the first line of the file
-fields = next(data)
-csv_dict = {}
-for row in data:
-    # Zip together the field names and values
-    items = zip(fields, row)
-    item = {}
-    key = ()
-    # Add the value to our dictionary
-    for (name, value) in items:
-        item[name] = value.strip()
+    data = csv.reader(open('spew_mapping.csv'))
+    # Read the column names from the first line of the file
+    fields = next(data)
+    csv_dict = {}
+    for row in data:
+        # Zip together the field names and values
+        items = zip(fields, row)
+        item = {}
+        key = ()
+        # Add the value to our dictionary
+        for (name, value) in items:
+            item[name] = value.strip()
 
-    key = item['Title']
-    csv_dict[key] = item
+        key = item['Title']
+        csv_dict[key] = item
 
-response = requests.get('https://zenodo.org//api/deposit/depositions', params={'access_token': ACCESS_TOKEN, 'size': 200, 'status': 'published'})
-json_response = response.json()
-for deposition_index in range(len(json_response)):
-    id = json_response[deposition_index]['id']
+    response = requests.get('https://zenodo.org//api/deposit/depositions', params={'access_token': ACCESS_TOKEN, 'size': 200, 'status': 'published'})
+    json_response = response.json()
+    for deposition_index in range(len(json_response)):
+        id = json_response[deposition_index]['id']
 
-    r = requests.get("https://zenodo.org/api/deposit/depositions/" + str(id),
-                     params={'access_token': ACCESS_TOKEN})
+        r = requests.get("https://zenodo.org/api/deposit/depositions/" + str(id),
+                         params={'access_token': ACCESS_TOKEN})
 
-    deposition_json = r.json()
+        deposition_json = r.json()
 
-    # Get download link for access url
-    access_url = "https://zenodo.org/record/" + str(deposition_json['record_id']) + "/files/" + deposition_json['files'][0]['filename']
+        # Get download link for access url
+        access_url = "https://zenodo.org/record/" + 
str(deposition_json['record_id']) + "/files/" + deposition_json['files'][0]['filename'] - # Get title to cross reference with spew_mapping.csv - title = deposition_json['title'] - if not "RABIES" in title.upper() and not "H1N1" in title: - try: - landing_url = "http://w3id.org/spew/" + csv_dict[title]['Anonymous Identifier'] - except KeyError: - continue + # Get title to cross reference with spew_mapping.csv + title = deposition_json['title'] + if not "RABIES" in title.upper() and not "H1N1" in title: + try: + landing_url = "http://w3id.org/spew/" + csv_dict[title]['Anonymous Identifier'] + except KeyError: + continue - # Extract the name from the landing page in spew_mapping, this will allow us to access the json file - file_name = () - old_landing_page = csv_dict[title]['Landing Page'].split('/') - if len(old_landing_page) > 10: - file_name = old_landing_page[8] + ".json" - else: - file_name = old_landing_page[7] + ".json" + # Extract the name from the landing page in spew_mapping, this will allow us to access the json file + file_name = () + old_landing_page = csv_dict[title]['Landing Page'].split('/') + if len(old_landing_page) > 10: + file_name = old_landing_page[8] + ".json" + else: + file_name = old_landing_page[7] + ".json" - # Update the dats file with the correct identifier information and the access and landing URLs - try: - with open(dats_folder+file_name) as json_file: - old_meta_data = json.load(json_file, object_pairs_hook=collections.OrderedDict) - except FileNotFoundError: - continue + # Update the dats file with the correct identifier information and the access and landing URLs + try: + with open(dats_folder+file_name) as json_file: + old_meta_data = json.load(json_file, object_pairs_hook=collections.OrderedDict) + except FileNotFoundError: + continue - old_meta_data['identifier']['identifier'] = deposition_json['doi_url'] - old_meta_data['identifier']['identifierSource'] = "zenodo" - old_meta_data['distributions'][0]['access']['accessURL'] = 
access_url - old_meta_data['distributions'][0]['access']['landingPage'] = landing_url + old_meta_data['identifier']['identifier'] = deposition_json['doi_url'] + old_meta_data['identifier']['identifierSource'] = "zenodo" + old_meta_data['distributions'][0]['access']['accessURL'] = access_url + old_meta_data['distributions'][0]['access']['landingPage'] = landing_url - with open(dats_folder+file_name, 'w') as outfile: - json.dump(old_meta_data, outfile, indent=4) - print("created " + file_name) \ No newline at end of file + with open(dats_folder+file_name, 'w') as outfile: + json.dump(old_meta_data, outfile, indent=4) + print("created " + file_name) \ No newline at end of file diff --git a/src/scripts/dats_to_doi/upload_dats_to_zenodo.py b/src/scripts/dats_to_doi/upload_dats_to_zenodo.py index 707819ea..8e9bf0c7 100644 --- a/src/scripts/dats_to_doi/upload_dats_to_zenodo.py +++ b/src/scripts/dats_to_doi/upload_dats_to_zenodo.py @@ -9,139 +9,144 @@ import os from os import system from pathlib import Path - -ACCESS_TOKEN = 'SvxcV0O7kHohjkBVHcHZ3iZmgtJvKeZPN85ZFtgrc5wa0Uup1MtYWl2HzWTw' -dats_folder = '/Users/amd176/Documents/Repositories/digital-commons/src/scripts/convert_to_dats/output/spew_us_dats_json' - -headers = {"Content-Type": "application/json"} - -# iterate over every file in this directory -for filename in os.listdir(dats_folder): - if filename.endswith(".json"): - # Read metadata as json - with open(os.path.join(dats_folder, filename)) as json_file: - json_data = json.load(json_file); - - # Get url from json metadata - url_identifier = (json_data['identifier']['identifier']) - - # If apollo library entry - if url_identifier: - # Parse url to xml to string to file - xml = XML.parse(URLLIB.urlopen(url_identifier)) - xml_string = xml.toprettyxml() - xml_filename = 'data.xml' - xml_file = open(xml_filename, "w") - xml_file.writelines(xml_string) - xml_file.close() - - # Parse URL to json to string to file - with URLLIB.urlopen(url_identifier.replace("xml", 
"json")) as url: - data_as_json = json.loads(url.read().decode()) - json_filename = 'data.json' - json_file = open(json_filename, "w") - json_file.writelines(json.dumps(data_as_json)) - json_file.close() - - # If spew entry - else: - access_url = json_data['distributions'][0]['access']['accessURL'] - identifier = access_url.split('/')[-2] - - # Check if spew data already exists, otherwise download it - spew_output_file = Path(dats_folder + identifier + ".tar.gz") - if not spew_output_file.is_file(): - hrefs = access_url.split('edu')[1] - post_request = access_url.split(identifier)[0] - system( - 'curl -X POST -F "action=download" -F "as=' + identifier + '.tar" -F "type=php-tar" -F "hrefs=' + hrefs + '" ' + post_request + ' | gzip -vc > ' + dats_folder + identifier + '.tar.gz') - - # Create empty upload - r = requests.post('https://zenodo.org/api/deposit/depositions', - params={'access_token': ACCESS_TOKEN}, json={}, - headers=headers) - - # get deposition id from previous response - deposition_id = r.json()['id'] - - # add metadata - creators = [] - for name_index in range(len(json_data['creators'])): - creators.append({'name': json_data['creators'][name_index]['lastName'] + ', ' + json_data['creators'][name_index]['firstName']},) - - # funders = [] - # for grant_index in range(len(json_data['acknowledges'])): - # funders.append({'id' : json_data['acknowledges'][grant_index]['identifier']['identifier']},) - - data = { - "metadata": { - "title": json_data['title'], - "upload_type": "dataset", - "creators": - creators, - "description": json_data['description'], - "access_right": "open", - # "grants": - # funders, - - # license for Creative Commons Attribution 4.0 - "license": { - "domain_content": "true", - "domain_data": "true", - "domain_software": "false", - "family": "", - "id": "CC-BY-4.0", - "od_conformance": "approved", - "osd_conformance": "not reviewed", - "maintainer": "Creative Commons", - "status": "active", - "title": "Creative Commons Attribution 4.0", 
- "url": "https://creativecommons.org/licenses/by/4.0/" +import sys + +if len(sys.argv) < 3: + print('DESCRIPTION:\n\tGenerate Zenodo drafts for DATS datasets\n\tA Zenodo access token and directory path are required\n') + print('USAGE:\n\tpython upload_dats_to_zenodo.py \n') +else: + ACCESS_TOKEN = sys.argv[1] + dats_folder = sys.argv[2] + + headers = {"Content-Type": "application/json"} + + # iterate over every file in this directory + for filename in os.listdir(dats_folder): + if filename.endswith(".json"): + # Read metadata as json + with open(os.path.join(dats_folder, filename)) as json_file: + json_data = json.load(json_file); + + # Get url from json metadata + url_identifier = (json_data['identifier']['identifier']) + + # If apollo library entry + if url_identifier: + # Parse url to xml to string to file + xml = XML.parse(URLLIB.urlopen(url_identifier)) + xml_string = xml.toprettyxml() + xml_filename = 'data.xml' + xml_file = open(xml_filename, "w") + xml_file.writelines(xml_string) + xml_file.close() + + # Parse URL to json to string to file + with URLLIB.urlopen(url_identifier.replace("xml", "json")) as url: + data_as_json = json.loads(url.read().decode()) + json_filename = 'data.json' + json_file = open(json_filename, "w") + json_file.writelines(json.dumps(data_as_json)) + json_file.close() + + # If spew entry + else: + access_url = json_data['distributions'][0]['access']['accessURL'] + identifier = access_url.split('/')[-2] + + # Check if spew data already exists, otherwise download it + spew_output_file = Path(dats_folder + identifier + ".tar.gz") + if not spew_output_file.is_file(): + hrefs = access_url.split('edu')[1] + post_request = access_url.split(identifier)[0] + system( + 'curl -X POST -F "action=download" -F "as=' + identifier + '.tar" -F "type=php-tar" -F "hrefs=' + hrefs + '" ' + post_request + ' | gzip -vc > ' + dats_folder + identifier + '.tar.gz') + + # Create empty upload + r = requests.post('https://zenodo.org/api/deposit/depositions', + 
params={'access_token': ACCESS_TOKEN}, json={}, + headers=headers) + + # get deposition id from previous response + deposition_id = r.json()['id'] + + # add metadata + creators = [] + for name_index in range(len(json_data['creators'])): + creators.append({'name': json_data['creators'][name_index]['lastName'] + ', ' + json_data['creators'][name_index]['firstName']},) + + # funders = [] + # for grant_index in range(len(json_data['acknowledges'])): + # funders.append({'id' : json_data['acknowledges'][grant_index]['identifier']['identifier']},) + + data = { + "metadata": { + "title": json_data['title'], + "upload_type": "dataset", + "creators": + creators, + "description": json_data['description'], + "access_right": "open", + # "grants": + # funders, + + # license for Creative Commons Attribution 4.0 + "license": { + "domain_content": "true", + "domain_data": "true", + "domain_software": "false", + "family": "", + "id": "CC-BY-4.0", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "maintainer": "Creative Commons", + "status": "active", + "title": "Creative Commons Attribution 4.0", + "url": "https://creativecommons.org/licenses/by/4.0/" + } } } - } - r = requests.put('https://zenodo.org/api/deposit/depositions/%s' % deposition_id, - params={'access_token': ACCESS_TOKEN}, data=json.dumps(data), - headers=headers) - - #upload files for library viewer - if url_identifier: - # upload new file (xml) - data = {'filename': json_data['title']+'.xml'} - files = {'file': open(xml_filename, "rb")} - r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id, - params={'access_token': ACCESS_TOKEN}, data=data, - files=files) - - # upload new file (json) - data = {'filename': json_data['title']+'.json'} - files = {'file' : open(json_filename,"rb")} - r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id, - params={'access_token': ACCESS_TOKEN}, data=data, - files=files) - - print(r.status_code) - 
print(r.json()) - - # delete temp files - os.remove(xml_filename) - os.remove(json_filename) + r = requests.put('https://zenodo.org/api/deposit/depositions/%s' % deposition_id, + params={'access_token': ACCESS_TOKEN}, data=json.dumps(data), + headers=headers) + + #upload files for library viewer + if url_identifier: + # upload new file (xml) + data = {'filename': json_data['title']+'.xml'} + files = {'file': open(xml_filename, "rb")} + r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id, + params={'access_token': ACCESS_TOKEN}, data=data, + files=files) - else: - data = {'filename' : 'output.tar.gz'} - files = {'file' : open(dats_folder + identifier + ".tar.gz", "rb")} - file_size = os.path.getsize(dats_folder + identifier + ".tar.gz") - if file_size < 1e+8: + # upload new file (json) + data = {'filename': json_data['title']+'.json'} + files = {'file' : open(json_filename,"rb")} r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id, - params={'access_token': ACCESS_TOKEN}, data=data, - files=files) + params={'access_token': ACCESS_TOKEN}, data=data, + files=files) - print(spew_output_file) + print(r.status_code) + print(r.json()) - print(r.status_code) - print(r.json()) + # delete temp files + os.remove(xml_filename) + os.remove(json_filename) + else: + data = {'filename' : 'output.tar.gz'} + files = {'file' : open(dats_folder + identifier + ".tar.gz", "rb")} + file_size = os.path.getsize(dats_folder + identifier + ".tar.gz") + if file_size < 1e+8: + r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id, + params={'access_token': ACCESS_TOKEN}, data=data, + files=files) - continue - else: - continue + print(spew_output_file) + + print(r.status_code) + print(r.json()) + + + continue + else: + continue