From 37ac14b5903418fd0317b560dff7a43e441a4a4b Mon Sep 17 00:00:00 2001 From: KI JONG BYUN Date: Sun, 6 Oct 2024 11:42:36 -0500 Subject: [PATCH 1/7] Make the Updater code dry --- src/updater/__init__.py | 53 +++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/src/updater/__init__.py b/src/updater/__init__.py index 6447e42..17bb86a 100644 --- a/src/updater/__init__.py +++ b/src/updater/__init__.py @@ -8,33 +8,28 @@ def __init__(self, county): self.county = county.lower() def update(self): + #This loads the environment for interacting with CosmosDB #Dan: Should this be moved to the .env file? + load_dotenv() + URL = os.getenv("URL") + KEY = os.getenv("KEY") + DATA_BASE_NAME = os.getenv("DATA_BASE_NAME") + CONTAINER_NAME_CLEANED = os.getenv("CONTAINER_NAME_CLEANED") + client = CosmosClient(URL, credential=KEY) + database = client.get_database_client(DATA_BASE_NAME) + COSMOSDB_CONTAINER_CASES_CLEANED = database.get_container_client(CONTAINER_NAME_CLEANED) case_json_cleaned_folder_path = os.path.join( os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned" ) - list_case_json_files = os.listdir(case_json_cleaned_folder_path) - limiter = 0 - #Loops through all of the cleaned and redacted JSON files (the final versions) + for case_json in list_case_json_files: - limiter +=1 - if limiter == 5: - break print(case_json) - # Opens the JSON file and reads it to a dictionary. - in_file = case_json_cleaned_folder_path + "\\" + case_json + in_file = case_json_cleaned_folder_path + "/" + case_json with open(in_file, "r") as f: input_dict = json.load(f) - print(input_dict) - #This loads the environment for interacting with CosmosDB #Dan: Should this be moved to the .env file? - load_dotenv() - URL = os.getenv("URL") - KEY = os.getenv("KEY") - DATA_BASE_NAME = os.getenv("DATA_BASE_NAME") - CONTAINER_NAME_CLEANED = os.getenv("CONTAINER_NAME_CLEANED") - client = CosmosClient(URL, credential=KEY) - database = client.get_database_client(DATA_BASE_NAME) - COSMOSDB_CONTAINER_CASES_CLEANED = database.get_container_client(CONTAINER_NAME_CLEANED) + # print(input_dict) + # Querying case databse to fetch all items that match the hash. hash_query = f"SELECT * FROM COSMOSDB_CONTAINER_CASES_CLEANED WHERE COSMOSDB_CONTAINER_CASES_CLEANED['html_hash'] = '{input_dict['html_hash']}'" try: @@ -42,10 +37,11 @@ def update(self): cases = list(COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=hash_query,enable_cross_partition_query=True)) except Exception as e: print(f"Error querying cases-cleaned database for an existing hash: {e.status_code} - {e.message}") - if len(cases) >0: + if len(cases) > 0: #There already exists one with the same hash, so skip this entirely. print(f"The case's HTML hash already exists in the databse: {case_json}. Not updating the database.") continue + # Querying case databse to fetch all items that match the cause number. case_query = f"SELECT * FROM COSMOSDB_CONTAINER_CASES_CLEANED WHERE COSMOSDB_CONTAINER_CASES_CLEANED['case_number'] = '{input_dict['case_number']}'" try: @@ -54,17 +50,16 @@ def update(self): except Exception as e: print(f"Error querying cases-cleaned database for an existing cases: {e.status_code} - {e.message}") #If there are no cases that match the cause number, then create the case ID, add a version number of 1 to the JSON and push the JSON to the database. + today = dt.today() + input_dict['id'] = input_dict['case_number'] + ":" + input_dict['county'] + ":" + today.strftime('%m-%d-%Y') + input_dict['html_hash'] + input_dict['version'] = max(int(case['version']) for case in cases) + 1 if len(cases) > 0 else 1 + # bkj: if updater is run more than once a day on the same county data, error will occur due to identical id. + if len(cases) == 0: print(f"No cases with this cause number exist in the databse: {case_json}. Pushing to database with version number 1.") - today = dt.today() - input_dict['id'] = input_dict['case_number'] + ":" + input_dict['county'] + ":" + today.strftime('%m-%d-%Y') + input_dict['html_hash'] - input_dict['version'] = 1 - COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict) - if len(cases) > 0: + else: print(f"Cause numbers exist in the database but none with the same hash: {case_json}. Pushing to database with next version number.") - today = dt.today() - input_dict['id'] = input_dict['case_number'] + ":" + input_dict['county'] + ":" + today.strftime('%m-%d-%Y') + input_dict['html_hash'] - next_version = max(int(case['version']) for case in cases) + 1 - input_dict['version'] = next_version - COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict) + COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict) +if __name__ == '__main__': + Updater('Hays').update() \ No newline at end of file From 4d13a4837f42adba0c5b9ddd7b47c83f74539d92 Mon Sep 17 00:00:00 2001 From: KI JONG BYUN Date: Sun, 6 Oct 2024 11:44:53 -0500 Subject: [PATCH 2/7] Add logger initialization code --- src/updater/__init__.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/updater/__init__.py b/src/updater/__init__.py index 17bb86a..c3da41c 100644 --- a/src/updater/__init__.py +++ b/src/updater/__init__.py @@ -2,12 +2,32 @@ from azure.cosmos import CosmosClient, exceptions from dotenv import load_dotenv from datetime import datetime as dt +import logging class Updater(): def __init__(self, county): self.county = county.lower() + def configure_logger(self): + # configure the logger + logger = logging.getLogger(name="pid: " + str(os.getpid())) + logging.basicConfig() + logging.root.setLevel(level="INFO") + + # bkj: logger test + file_handler = logging.FileHandler('logger_log.txt') + file_handler.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + logger.info("logger is ready for Updater class") + return logger + def update(self): + # bkj: logger test + logger = self.configure_logger() + #This loads the environment for interacting with CosmosDB #Dan: Should this be moved to the .env file? load_dotenv() URL = os.getenv("URL") From 7d80c614d7619114db9c76bf42c18caabe1c70a3 Mon Sep 17 00:00:00 2001 From: KI JONG BYUN Date: Sat, 12 Oct 2024 23:17:01 -0500 Subject: [PATCH 3/7] Add more error handling and logger messages --- src/updater/__init__.py | 43 +++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/src/updater/__init__.py b/src/updater/__init__.py index c3da41c..27113bf 100644 --- a/src/updater/__init__.py +++ b/src/updater/__init__.py @@ -21,7 +21,6 @@ def configure_logger(self): file_handler.setFormatter(formatter) logger.addHandler(file_handler) - logger.info("logger is ready for Updater class") return logger def update(self): @@ -34,9 +33,21 @@ def update(self): KEY = os.getenv("KEY") DATA_BASE_NAME = os.getenv("DATA_BASE_NAME") CONTAINER_NAME_CLEANED = os.getenv("CONTAINER_NAME_CLEANED") - client = CosmosClient(URL, credential=KEY) - database = client.get_database_client(DATA_BASE_NAME) - COSMOSDB_CONTAINER_CASES_CLEANED = database.get_container_client(CONTAINER_NAME_CLEANED) + try: + client = CosmosClient(URL, credential=KEY) + except Exception as e: + logger.error(f"Error instantiating CosmosClient: {e.status_code} - {e.message}") + return + try: + database = client.get_database_client(DATA_BASE_NAME) + except Exception as e: + logger.error(f"Error instantiating DatabaseClient: {e.status_code} - {e.message}") + return + try: + COSMOSDB_CONTAINER_CASES_CLEANED = database.get_container_client(CONTAINER_NAME_CLEANED) + except Exception as e: + logger.error(f"Error instantiating ContainerClient: {e.status_code} - {e.message}") + return case_json_cleaned_folder_path = os.path.join( os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned" @@ -49,6 +60,7 @@ def update(self): with open(in_file, "r") as f: input_dict = json.load(f) # print(input_dict) + logger.info(f"[Case Filename: {case_json}, Case Number: {input_dict.get('case_number', None)}, HTML Hash: {input_dict.get('html_hash', None)}]") # Querying case databse to fetch all items that match the hash. hash_query = f"SELECT * FROM COSMOSDB_CONTAINER_CASES_CLEANED WHERE COSMOSDB_CONTAINER_CASES_CLEANED['html_hash'] = '{input_dict['html_hash']}'" @@ -57,9 +69,14 @@ def update(self): cases = list(COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=hash_query,enable_cross_partition_query=True)) except Exception as e: print(f"Error querying cases-cleaned database for an existing hash: {e.status_code} - {e.message}") + logger.error(f"Error querying cases-cleaned database for an existing hash: {e.status_code} - {e.message}") + # bkj - I think we may need to move on to a next case when this exception occurs. + continue + if len(cases) > 0: #There already exists one with the same hash, so skip this entirely. print(f"The case's HTML hash already exists in the databse: {case_json}. Not updating the database.") + logger.info(f"The case's HTML hash already exists in the databse: {case_json}. Not updating the database.") continue # Querying case databse to fetch all items that match the cause number. @@ -69,17 +86,23 @@ def update(self): cases = list(COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=case_query,enable_cross_partition_query=True)) except Exception as e: print(f"Error querying cases-cleaned database for an existing cases: {e.status_code} - {e.message}") + logger.error(f"Error querying cases-cleaned database for an existing cases: {e.status_code} - {e.message}") + # bkj - I think we may need to move on to a next case when this exception occurs. + continue + #If there are no cases that match the cause number, then create the case ID, add a version number of 1 to the JSON and push the JSON to the database. today = dt.today() input_dict['id'] = input_dict['case_number'] + ":" + input_dict['county'] + ":" + today.strftime('%m-%d-%Y') + input_dict['html_hash'] input_dict['version'] = max(int(case['version']) for case in cases) + 1 if len(cases) > 0 else 1 - # bkj: if updater is run more than once a day on the same county data, error will occur due to identical id. + try: + COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict) + except Exception as e: + logger.error(f"Error inserting this case to cases-cleaned database: {e.status_code} - {e.message}") + # bkj - I think we may need to move on to a next case when this exception occurs. + # bkj: if updater is run more than once a day on the same county data, error will occur due to identical id. + continue - if len(cases) == 0: - print(f"No cases with this cause number exist in the databse: {case_json}. Pushing to database with version number 1.") - else: - print(f"Cause numbers exist in the database but none with the same hash: {case_json}. Pushing to database with next version number.") - COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict) + logger.info(f"Insertion successfully done with id: {input_dict['id']}, version: { input_dict['version']}") if __name__ == '__main__': Updater('Hays').update() \ No newline at end of file From be323b3967e6804741bbfdac5b986b130d0b0ff4 Mon Sep 17 00:00:00 2001 From: KI JONG BYUN Date: Sun, 13 Oct 2024 14:06:24 -0500 Subject: [PATCH 4/7] Remove logs printed to the screen --- src/updater/__init__.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/updater/__init__.py b/src/updater/__init__.py index 27113bf..be6da5a 100644 --- a/src/updater/__init__.py +++ b/src/updater/__init__.py @@ -9,12 +9,10 @@ def __init__(self, county): self.county = county.lower() def configure_logger(self): - # configure the logger logger = logging.getLogger(name="pid: " + str(os.getpid())) logging.basicConfig() logging.root.setLevel(level="INFO") - # bkj: logger test file_handler = logging.FileHandler('logger_log.txt') file_handler.setLevel(logging.DEBUG) formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') @@ -24,7 +22,6 @@ def configure_logger(self): return logger def update(self): - # bkj: logger test logger = self.configure_logger() #This loads the environment for interacting with CosmosDB #Dan: Should this be moved to the .env file? @@ -37,16 +34,19 @@ def update(self): client = CosmosClient(URL, credential=KEY) except Exception as e: logger.error(f"Error instantiating CosmosClient: {e.status_code} - {e.message}") + # bkj - Need to save list_case_json_files(all input filenames) to a separate file. return try: database = client.get_database_client(DATA_BASE_NAME) except Exception as e: logger.error(f"Error instantiating DatabaseClient: {e.status_code} - {e.message}") + # bkj - Need to save list_case_json_files(all input filenames) to a separate file. return try: COSMOSDB_CONTAINER_CASES_CLEANED = database.get_container_client(CONTAINER_NAME_CLEANED) except Exception as e: logger.error(f"Error instantiating ContainerClient: {e.status_code} - {e.message}") + # bkj - Need to save list_case_json_files(all input filenames) to a separate file. return case_json_cleaned_folder_path = os.path.join( @@ -55,11 +55,9 @@ def update(self): list_case_json_files = os.listdir(case_json_cleaned_folder_path) for case_json in list_case_json_files: - print(case_json) in_file = case_json_cleaned_folder_path + "/" + case_json with open(in_file, "r") as f: input_dict = json.load(f) - # print(input_dict) logger.info(f"[Case Filename: {case_json}, Case Number: {input_dict.get('case_number', None)}, HTML Hash: {input_dict.get('html_hash', None)}]") # Querying case databse to fetch all items that match the hash. @@ -68,14 +66,12 @@ def update(self): # Execute the query cases = list(COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=hash_query,enable_cross_partition_query=True)) except Exception as e: - print(f"Error querying cases-cleaned database for an existing hash: {e.status_code} - {e.message}") logger.error(f"Error querying cases-cleaned database for an existing hash: {e.status_code} - {e.message}") - # bkj - I think we may need to move on to a next case when this exception occurs. + # bkj - Need to save case_json(input filename) to a separate file. continue if len(cases) > 0: #There already exists one with the same hash, so skip this entirely. - print(f"The case's HTML hash already exists in the databse: {case_json}. Not updating the database.") logger.info(f"The case's HTML hash already exists in the databse: {case_json}. Not updating the database.") continue @@ -85,9 +81,8 @@ def update(self): # Execute the query cases = list(COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=case_query,enable_cross_partition_query=True)) except Exception as e: - print(f"Error querying cases-cleaned database for an existing cases: {e.status_code} - {e.message}") logger.error(f"Error querying cases-cleaned database for an existing cases: {e.status_code} - {e.message}") - # bkj - I think we may need to move on to a next case when this exception occurs. + # bkj - Need to save case_json(input filename) to a separate file. continue #If there are no cases that match the cause number, then create the case ID, add a version number of 1 to the JSON and push the JSON to the database. @@ -98,8 +93,7 @@ def update(self): COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict) except Exception as e: logger.error(f"Error inserting this case to cases-cleaned database: {e.status_code} - {e.message}") - # bkj - I think we may need to move on to a next case when this exception occurs. - # bkj: if updater is run more than once a day on the same county data, error will occur due to identical id. + # bkj - Need to save case_json(input filename) to a separate file. continue logger.info(f"Insertion successfully done with id: {input_dict['id']}, version: { input_dict['version']}") From 2898602d81afe9056dda83439b3f2f4d66ee2e65 Mon Sep 17 00:00:00 2001 From: KI JONG BYUN Date: Mon, 21 Oct 2024 21:14:05 -0500 Subject: [PATCH 5/7] Add all loggings to a separate file --- src/updater/__init__.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/updater/__init__.py b/src/updater/__init__.py index be6da5a..c20aa10 100644 --- a/src/updater/__init__.py +++ b/src/updater/__init__.py @@ -8,21 +8,33 @@ class Updater(): def __init__(self, county): self.county = county.lower() - def configure_logger(self): + def configure_logger(self, dir_path): logger = logging.getLogger(name="pid: " + str(os.getpid())) - logging.basicConfig() - logging.root.setLevel(level="INFO") + logger.setLevel(logging.DEBUG) - file_handler = logging.FileHandler('logger_log.txt') + file_handler = logging.FileHandler(os.path.join(dir_path, 'logger_log.txt')) file_handler.setLevel(logging.DEBUG) formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') file_handler.setFormatter(formatter) logger.addHandler(file_handler) + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.WARNING) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + return logger def update(self): - logger = self.configure_logger() + # open or create a output directory for a log and successfully processed data + processed_case_json_cleaned_foler_path = os.path.join( + os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned", f"result_{dt.today().strftime('%Y-%m-%d.%H:%M:%S.%f')}" + ) + if not os.path.exists(processed_case_json_cleaned_foler_path): + # Create the folder if it doesn't exist + os.makedirs(processed_case_json_cleaned_foler_path) + + logger = self.configure_logger(processed_case_json_cleaned_foler_path) #This loads the environment for interacting with CosmosDB #Dan: Should this be moved to the .env file? load_dotenv() @@ -34,19 +46,16 @@ def update(self): client = CosmosClient(URL, credential=KEY) except Exception as e: logger.error(f"Error instantiating CosmosClient: {e.status_code} - {e.message}") - # bkj - Need to save list_case_json_files(all input filenames) to a separate file. return try: database = client.get_database_client(DATA_BASE_NAME) except Exception as e: logger.error(f"Error instantiating DatabaseClient: {e.status_code} - {e.message}") - # bkj - Need to save list_case_json_files(all input filenames) to a separate file. return try: COSMOSDB_CONTAINER_CASES_CLEANED = database.get_container_client(CONTAINER_NAME_CLEANED) except Exception as e: logger.error(f"Error instantiating ContainerClient: {e.status_code} - {e.message}") - # bkj - Need to save list_case_json_files(all input filenames) to a separate file. return case_json_cleaned_folder_path = os.path.join( @@ -56,6 +65,9 @@ def update(self): for case_json in list_case_json_files: in_file = case_json_cleaned_folder_path + "/" + case_json + if not os.path.isfile(in_file): + continue + with open(in_file, "r") as f: input_dict = json.load(f) logger.info(f"[Case Filename: {case_json}, Case Number: {input_dict.get('case_number', None)}, HTML Hash: {input_dict.get('html_hash', None)}]") @@ -67,7 +79,6 @@ def update(self): cases = list(COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=hash_query,enable_cross_partition_query=True)) except Exception as e: logger.error(f"Error querying cases-cleaned database for an existing hash: {e.status_code} - {e.message}") - # bkj - Need to save case_json(input filename) to a separate file. continue if len(cases) > 0: @@ -82,7 +93,6 @@ def update(self): cases = list(COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=case_query,enable_cross_partition_query=True)) except Exception as e: logger.error(f"Error querying cases-cleaned database for an existing cases: {e.status_code} - {e.message}") - # bkj - Need to save case_json(input filename) to a separate file. continue #If there are no cases that match the cause number, then create the case ID, add a version number of 1 to the JSON and push the JSON to the database. @@ -93,7 +103,6 @@ def update(self): COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict) except Exception as e: logger.error(f"Error inserting this case to cases-cleaned database: {e.status_code} - {e.message}") - # bkj - Need to save case_json(input filename) to a separate file. continue logger.info(f"Insertion successfully done with id: {input_dict['id']}, version: { input_dict['version']}") From 08c4adf9fa6eba0ce03e451dbb6b9a2b94fab805 Mon Sep 17 00:00:00 2001 From: KI JONG BYUN Date: Mon, 21 Oct 2024 22:24:02 -0500 Subject: [PATCH 6/7] Move successfully inserted data files to result foler --- src/updater/__init__.py | 46 +++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/src/updater/__init__.py b/src/updater/__init__.py index c20aa10..a0ef2c4 100644 --- a/src/updater/__init__.py +++ b/src/updater/__init__.py @@ -7,12 +7,22 @@ class Updater(): def __init__(self, county): self.county = county.lower() + self.case_json_cleaned_folder_path = os.path.join( + os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned" + ) + self.processed_path = os.path.join(self.case_json_cleaned_folder_path, + f"result_{dt.today().strftime('%Y-%m-%d.%H:%M:%S.%f')}") + + # open or create a output directory for a log and successfully processed data + if os.path.exists(self.case_json_cleaned_folder_path) and \ + not os.path.exists(self.processed_path): + os.makedirs(self.processed_path) - def configure_logger(self, dir_path): + def configure_logger(self): logger = logging.getLogger(name="pid: " + str(os.getpid())) logger.setLevel(logging.DEBUG) - file_handler = logging.FileHandler(os.path.join(dir_path, 'logger_log.txt')) + file_handler = logging.FileHandler(os.path.join(self.processed_path, 'logger_log.txt')) file_handler.setLevel(logging.DEBUG) formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') file_handler.setFormatter(formatter) @@ -26,15 +36,11 @@ def configure_logger(self, dir_path): return logger def update(self): - # open or create a output directory for a log and successfully processed data - processed_case_json_cleaned_foler_path = os.path.join( - os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned", f"result_{dt.today().strftime('%Y-%m-%d.%H:%M:%S.%f')}" - ) - if not os.path.exists(processed_case_json_cleaned_foler_path): - # Create the folder if it doesn't exist - os.makedirs(processed_case_json_cleaned_foler_path) + logger = self.configure_logger() - logger = self.configure_logger(processed_case_json_cleaned_foler_path) + if not os.path.exists(self.case_json_cleaned_folder_path): + logger.error(f'The following path doesn\'t exits: \n{self.case_json_cleaned_folder_path}') + return #This loads the environment for interacting with CosmosDB #Dan: Should this be moved to the .env file? load_dotenv() @@ -57,15 +63,14 @@ def update(self): except Exception as e: logger.error(f"Error instantiating ContainerClient: {e.status_code} - {e.message}") return - - case_json_cleaned_folder_path = os.path.join( - os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned" - ) - list_case_json_files = os.listdir(case_json_cleaned_folder_path) + + list_case_json_files = os.listdir(self.case_json_cleaned_folder_path) for case_json in list_case_json_files: - in_file = case_json_cleaned_folder_path + "/" + case_json - if not os.path.isfile(in_file): + in_file = self.case_json_cleaned_folder_path + "/" + case_json + if os.path.isfile(in_file): + dest_file = self.processed_path + "/" + case_json + else: continue with open(in_file, "r") as f: @@ -82,7 +87,9 @@ def update(self): continue if len(cases) > 0: - #There already exists one with the same hash, so skip this entirely. + # There already exists one with the same hash, so skip this entirely. + # Move the file to the processed folder. + os.rename(in_file, dest_file) logger.info(f"The case's HTML hash already exists in the databse: {case_json}. Not updating the database.") continue @@ -105,6 +112,9 @@ def update(self): logger.error(f"Error inserting this case to cases-cleaned database: {e.status_code} - {e.message}") continue + # This case is inserted successfully. + # Move the file to the processed folder. + os.rename(in_file, dest_file) logger.info(f"Insertion successfully done with id: {input_dict['id']}, version: { input_dict['version']}") if __name__ == '__main__': From c4d8311667366c0a44f7c7c760e430c6f532ae53 Mon Sep 17 00:00:00 2001 From: KI JONG BYUN Date: Mon, 21 Oct 2024 22:46:57 -0500 Subject: [PATCH 7/7] Make DB container initialization as a seperate method --- src/updater/__init__.py | 47 +++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/src/updater/__init__.py b/src/updater/__init__.py index a0ef2c4..370b404 100644 --- a/src/updater/__init__.py +++ b/src/updater/__init__.py @@ -17,6 +17,9 @@ def __init__(self, county): if os.path.exists(self.case_json_cleaned_folder_path) and \ not os.path.exists(self.processed_path): os.makedirs(self.processed_path) + self.logger = self.configure_logger() + + self.COSMOSDB_CONTAINER_CASES_CLEANED = self.get_database_container() def configure_logger(self): logger = logging.getLogger(name="pid: " + str(os.getpid())) @@ -34,14 +37,8 @@ def configure_logger(self): logger.addHandler(console_handler) return logger - - def update(self): - logger = self.configure_logger() - - if not os.path.exists(self.case_json_cleaned_folder_path): - logger.error(f'The following path doesn\'t exits: \n{self.case_json_cleaned_folder_path}') - return - + + def get_database_container(self): #This loads the environment for interacting with CosmosDB #Dan: Should this be moved to the .env file? load_dotenv() URL = os.getenv("URL") @@ -51,19 +48,29 @@ def update(self): try: client = CosmosClient(URL, credential=KEY) except Exception as e: - logger.error(f"Error instantiating CosmosClient: {e.status_code} - {e.message}") + self.logger.error(f"Error instantiating CosmosClient: {e.status_code} - {e.message}") return try: database = client.get_database_client(DATA_BASE_NAME) except Exception as e: - logger.error(f"Error instantiating DatabaseClient: {e.status_code} - {e.message}") + self.logger.error(f"Error instantiating DatabaseClient: {e.status_code} - {e.message}") return try: COSMOSDB_CONTAINER_CASES_CLEANED = database.get_container_client(CONTAINER_NAME_CLEANED) except Exception as e: - logger.error(f"Error instantiating ContainerClient: {e.status_code} - {e.message}") + self.logger.error(f"Error instantiating ContainerClient: {e.status_code} - {e.message}") return + return COSMOSDB_CONTAINER_CASES_CLEANED + + def update(self): + if not os.path.exists(self.case_json_cleaned_folder_path): + self.logger.error(f'The following path doesn\'t exits: \n{self.case_json_cleaned_folder_path}') + return + + if not self.COSMOSDB_CONTAINER_CASES_CLEANED: + return + list_case_json_files = os.listdir(self.case_json_cleaned_folder_path) for case_json in list_case_json_files: @@ -75,31 +82,31 @@ def update(self): with open(in_file, "r") as f: input_dict = json.load(f) - logger.info(f"[Case Filename: {case_json}, Case Number: {input_dict.get('case_number', None)}, HTML Hash: {input_dict.get('html_hash', None)}]") + self.logger.info(f"[Case Filename: {case_json}, Case Number: {input_dict.get('case_number', None)}, HTML Hash: {input_dict.get('html_hash', None)}]") # Querying case databse to fetch all items that match the hash. hash_query = f"SELECT * FROM COSMOSDB_CONTAINER_CASES_CLEANED WHERE COSMOSDB_CONTAINER_CASES_CLEANED['html_hash'] = '{input_dict['html_hash']}'" try: # Execute the query - cases = list(COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=hash_query,enable_cross_partition_query=True)) + cases = list(self.COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=hash_query,enable_cross_partition_query=True)) except Exception as e: - logger.error(f"Error querying cases-cleaned database for an existing hash: {e.status_code} - {e.message}") + self.logger.error(f"Error querying cases-cleaned database for an existing hash: {e.status_code} - {e.message}") continue if len(cases) > 0: # There already exists one with the same hash, so skip this entirely. # Move the file to the processed folder. os.rename(in_file, dest_file) - logger.info(f"The case's HTML hash already exists in the databse: {case_json}. Not updating the database.") + self.logger.info(f"The case's HTML hash already exists in the databse: {case_json}. Not updating the database.") continue # Querying case databse to fetch all items that match the cause number. case_query = f"SELECT * FROM COSMOSDB_CONTAINER_CASES_CLEANED WHERE COSMOSDB_CONTAINER_CASES_CLEANED['case_number'] = '{input_dict['case_number']}'" try: # Execute the query - cases = list(COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=case_query,enable_cross_partition_query=True)) + cases = list(self.COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=case_query,enable_cross_partition_query=True)) except Exception as e: - logger.error(f"Error querying cases-cleaned database for an existing cases: {e.status_code} - {e.message}") + self.logger.error(f"Error querying cases-cleaned database for an existing cases: {e.status_code} - {e.message}") continue #If there are no cases that match the cause number, then create the case ID, add a version number of 1 to the JSON and push the JSON to the database. @@ -107,15 +114,15 @@ def update(self): input_dict['id'] = input_dict['case_number'] + ":" + input_dict['county'] + ":" + today.strftime('%m-%d-%Y') + input_dict['html_hash'] input_dict['version'] = max(int(case['version']) for case in cases) + 1 if len(cases) > 0 else 1 try: - COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict) + self.COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict) except Exception as e: - logger.error(f"Error inserting this case to cases-cleaned database: {e.status_code} - {e.message}") + self.logger.error(f"Error inserting this case to cases-cleaned database: {e.status_code} - {e.message}") continue # This case is inserted successfully. # Move the file to the processed folder. os.rename(in_file, dest_file) - logger.info(f"Insertion successfully done with id: {input_dict['id']}, version: { input_dict['version']}") + self.logger.info(f"Insertion successfully done with id: {input_dict['id']}, version: { input_dict['version']}") if __name__ == '__main__': Updater('Hays').update() \ No newline at end of file