From 19764ff94efb883660f57c53415b4fb32b92e7c0 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Tue, 26 Mar 2024 09:09:28 +0000 Subject: [PATCH 01/20] Removed old inputs and updated comments --- group_run.py | 152 +++++++++++++++++++++++++++------------------------ 1 file changed, 82 insertions(+), 70 deletions(-) diff --git a/group_run.py b/group_run.py index 0520606..6ab23ac 100644 --- a/group_run.py +++ b/group_run.py @@ -1,3 +1,7 @@ +__author__ = "Daniel Westwood" +__contact__ = "daniel.westwood@stfc.ac.uk" +__copyright__ = "Copyright 2023 United Kingdom Research and Innovation" + import sys import json import os @@ -5,34 +9,48 @@ import subprocess from pipeline.logs import init_logger -from pipeline.utils import BypassSwitch - -def get_group_len(workdir, group, repeat_id=1): - """Implement parallel reads from single 'group' file""" - with open(f'{workdir}/groups/{group}/proj_codes/{repeat_id}.txt') as f: - group_len = len(list(f.readlines())) - return group_len +from pipeline.utils import BypassSwitch, get_attribute, get_codes +from pipeline.allocate import create_allocation times = { - 'scan':'5:00', - 'compute':'30:00', - 'validate':'15:00' + 'scan' :'10:00', # No prediction possible prior to scanning + 'compute' :'60:00', + 'validate':'30:00' # From CMIP experiments - no reliable prediction mechanism possible } phases = list(times.keys()) -def get_attribute(env, args, var): - """Assemble environment variable or take from passed argument.""" - if os.getenv(env): - return os.getenv(env) - elif hasattr(args, var): - return getattr(args, var) +def get_group_len(workdir, group, repeat_id='main') -> int: + """ + Implement parallel reads from single 'group' file + + :param workdir: (str) The path of the current pipeline working directory. + + :param group: (str) The name of the dataset group within the pipeline. + + :param repeat_id: (int) Repeat-id subset within the group, default is main. + + :returns: (int) The number of projects within the specified subset + of a group of datasets. + """ + + # Semi-superfluous function, left in only as a doc-string reference. + codes = get_codes(group, workdir, f'proj_codes/{repeat_id}') + if codes: + return len(codes) else: - print(f'Error: Missing attribute {var}') - return None + return 0 + +def main(args) -> None: + """ + Assemble sbatch script for parallel running jobs and execute. May include + allocation of multiple tasks to each job if enabled. -def main(args,get_id=False, dependent_id=False): - """Assemble sbatch script for parallel running jobs""" + :param args: (Object) ArgParse object containing all required parameters + from default values or specific inputs from command-line. 
+ + :returns: None + """ logger = init_logger(args.verbose, 0, 'main-group') @@ -40,77 +58,83 @@ def main(args,get_id=False, dependent_id=False): phase = args.phase group = args.groupID - WORKDIR = get_attribute('WORKDIR', args, 'workdir') - if not WORKDIR: - logger.error('WORKDIR missing or undefined') - return None - SRCDIR = get_attribute('SRCDIR', args, 'source') - if not SRCDIR: - logger.error('SRCDIR missing or undefined') - return None - VENV = get_attribute('KVENV', args, 'kvenv') - if not VENV: - logger.error('VENV missing or undefined') + if phase not in phases: + logger.error(f'"{phase}" not recognised, please select from {phases}') return None - GROUPDIR = f'{WORKDIR}/groups/{group}' - # init not parallelised + args.workdir = get_attribute('WORKDIR', args, 'workdir') + SRCDIR = get_attribute('SRCDIR', args, 'source') + VENV = get_attribute('KVENV', args, 'kvenv') + + args.groupdir = f'{args.workdir}/groups/{group}' + + logger.info(f'Starting group execution for {group}') + logger.debug('Pipeline variables:') + logger.debug(f'WORKDIR : {args.workdir}') + logger.debug(f'GROUPDIR: {args.groupdir}') + logger.debug(f'SRCDIR : {SRCDIR}') + logger.debug(f'VENVDIR : {VENV}') + + # Experimental bin-packing: Not fully implemented 25/03 + if args.binpack: + create_allocation(args) + + # Init not parallelised - run for whole group here if phase == 'init': from pipeline.init import init_config - logger.info('Running init steps as serial process') - args.groupdir = GROUPDIR - args.workdir = WORKDIR + logger.info(f'Running init steps as a serial process for {group}') args.source = SRCDIR args.venvpath = VENV init_config(args) return None # Establish some group parameters - group_len = get_group_len(WORKDIR, group, repeat_id = args.repeat_id) - group_phase_sbatch = f'{GROUPDIR}/sbatch/{phase}.sbatch' + group_len = get_group_len(args.workdir, group, repeat_id = args.repeat_id) + group_phase_sbatch = f'{args.groupdir}/sbatch/{phase}.sbatch' master_script = f'{SRCDIR}/single_run.py' template = 'extensions/templates/phase.sbatch.template' - # Make Directories - for dirx in ['sbatch','outs','errs']: - if not os.path.isdir(f'{GROUPDIR}/{dirx}'): - os.makedirs(f'{GROUPDIR}/{dirx}') - - if phase not in phases: - logger.error(f'"{phase}" not recognised, please select from {phases}') - return None + for dirx in ['sbatch','outs','errs']: # Add allocations + if not os.path.isdir(f'{args.groupdir}/{dirx}'): + os.makedirs(f'{args.groupdir}/{dirx}') + # Open sbatch template from file. 
with open(template) as f: sbatch = '\n'.join([r.strip() for r in f.readlines()]) + # Setup time and memory defaults time = times[phase] if args.time_allowed: time = args.time_allowed - - label = phase - if args.repeat_id: - label = args.repeat_id - mem = '2G' if args.memory: mem = args.memory + outdir = f'{args.workdir}/groups/args.groupID/outs/raw/%A_%a.out' + errdir = f'{args.workdir}/groups/{args.groupID}/errs/raw/%A_%a.out' + + os.system(f'rm -rf {outdir}/*') + os.system(f'rm -rf {errdir}/*') + sb = sbatch.format( f'{group}_{phase}_array', # Job name time, # Time mem, # Memory + outdir, + errdir, VENV, - WORKDIR, - GROUPDIR, - master_script, phase, group, time, mem + args.workdir, + args.groupdir, + master_script, phase, group, time, mem, args.repeat_id ) + + # Additional carry-through flags + sb += f' -b {args.bypass}' if args.forceful: sb += ' -f' if args.verbose: sb += ' -v' - if args.bypass != 'FDSC': - sb += f' -b {args.bypass}' if args.quality: sb += ' -Q' if args.backtrack: @@ -118,9 +142,6 @@ def main(args,get_id=False, dependent_id=False): if args.dryrun: sb += ' -d' - if 'X' in args.bypass: - logger.warning('Running with XK Shape Bypass flag "X" is experimental and should only be used with approval.') - if args.repeat_id: sb += f' -r {args.repeat_id}' @@ -132,17 +153,7 @@ def main(args,get_id=False, dependent_id=False): logger.info('DRYRUN: sbatch command: ') print(f'sbatch --array=0-{group_len-1} {group_phase_sbatch}') else: - if get_id: # Unused section to save the ID of the process - result = subprocess.run(['sbatch', f'--array=0-{group_len-1}', group_phase_sbatch], stdout=subprocess.PIPE) - try: - id = result.stdout.decode('utf-8').split(' ')[3].strip() # Check! - assert len(id) == 8 - return id - except: - logger.error('Slurm submission failed') - return None - else: - os.system(f'sbatch --array=0-{group_len-1} {group_phase_sbatch}') + os.system(f'sbatch --array=0-{group_len-1} {group_phase_sbatch}') if __name__ == '__main__': parser = argparse.ArgumentParser(description='Run a pipeline step for a group of datasets') @@ -153,6 +164,7 @@ def main(args,get_id=False, dependent_id=False): parser.add_argument('-S','--source', dest='source', help='Path to directory containing master scripts (this one)') parser.add_argument('-e','--environ',dest='venvpath', help='Path to virtual (e)nvironment (excludes /bin/activate)') parser.add_argument('-i', '--input', dest='input', help='input file (for init phase)') + parser.add_argument('-A', '--alloc-bins', dest='binpack',action='store_true', help='input file (for init phase)') # Action-based - standard flags parser.add_argument('-f','--forceful',dest='forceful',action='store_true', help='Force overwrite of steps if previously done') From a57c87c0fc1f947660e929829e1fc23f8a570e06 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Tue, 26 Mar 2024 09:24:48 +0000 Subject: [PATCH 02/20] Partial docstring update - test locally --- assess.py | 393 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 204 insertions(+), 189 deletions(-) diff --git a/assess.py b/assess.py index 2baf882..54aff0e 100644 --- a/assess.py +++ b/assess.py @@ -1,3 +1,7 @@ +__author__ = "Daniel Westwood" +__contact__ = "daniel.westwood@stfc.ac.uk" +__copyright__ = "Copyright 2023 United Kingdom Research and Innovation" + import os import argparse import glob @@ -6,10 +10,13 @@ from datetime import datetime from pipeline.logs import init_logger, log_status, FalseLogger -from pipeline.utils import get_attribute, format_str, mem_to_val, get_codes +from 
pipeline.utils import get_attribute, format_str, \ + mem_to_val, get_codes, set_codes, get_proj_file, \ + set_proj_file import pipeline.errors as errs -# Hints for custom errors +# Hints for custom errors - unused +""" HINTS = {} for obj in dir(errs): object = getattr(errs, obj) @@ -18,17 +25,42 @@ HINTS[inst.get_str()] = inst.message except: pass +""" + +def get_rerun_command(phase: str, ecode: str, groupID: str, repeat_id: str) -> None: + """ + Print a rerun command for inspecting a single dataset using single_run.py + + :param phase: (str) The current phase + + :param ecode: (str) The index of the project code to be inspected by running + in serial. + + :param groupID: (str) The name of the group which this project code belongs to. + + :param repeat_id: (str) The subset within the group (default is main) + + :returns: + """ + if repeat_id != 'main': + print(f'python single_run.py {phase} {ecode} -G {groupID} -r {repeat_id} -vv -d') + else: + print(f'python single_run.py {phase} {ecode} -G {groupID} -vv -d') + +def get_index_of_code(workdir: str, groupID: str, repeat_id: str, code: str) -> int: + """ + Get the index of a project code within some repeat set of codes + :param workdir: (str) The current pipeline working directory. -phases = ['scan', 'compute', 'validate', 'complete'] -checks = ['/detail-cfg.json','/*kerchunk*','/*.complete'] + :param groupID: (str) The name of the group which this project code belongs to. -def get_rerun_command(phase: str, ecode: str, groupID: str, repeat_id: str): - """Print a rerun command for inspecting a single dataset using single_run.py""" - print(f'python single_run.py {phase} {ecode} -G {groupID} -r {repeat_id} -vv -d') + :param repeat_id: (str) The subset within the group (default is main) -def get_index_of_code(workdir, groupID, repeat_id, code): - """Get the index of a project code within some repeat set of codes""" + :param code: (str) The project code for which to get the index. + + :returns pindex: (int) The index of the project code within this group-subgroup. + """ proj_codes = get_codes(groupID, workdir, f'proj_codes/{repeat_id}') pindex = 0 pcode = proj_codes[pindex] @@ -63,7 +95,7 @@ def examine_log(workdir: str, proj_code: str, phase: str, groupID=None, repeat_i paused=input('Type "E" to exit assessment: ') if paused == 'E': - sys.exit() + raise KeyboardInterrupt def merge_old_new(old_codes, new_codes, index_old='', index_new='', reason=None): """Merge an existing list of project codes with a new set @@ -98,30 +130,31 @@ def save_selection(codes: list, groupdir: str, label: str, logger, overwrite=0, Requires a groupdir (directory belonging to a group), list of codes and a label for the new file. """ + + # Annoying it seems to require force-removing 'None' values. 
+ if not overwrite: + overwrite=0 + if len(codes) > 0: if index: codeset = '\n'.join([code[index].strip() for code in codes]) else: codeset = '\n'.join(codes) - if os.path.isfile(f'{groupdir}/proj_codes/{label}.txt'): + + write = True + if get_codes(groupdir, None, f'proj_codes/{label}'): if overwrite == 0: print(f'Skipped writing {len(codes)} to proj_codes/{label} - file exists and overwrite not set') + write = False elif overwrite == 1: print(f'Adding {len(codes)} to existing proj_codes/{label}') - with open(f'{groupdir}/proj_codes/{label}.txt') as f: - old_codes = [r.strip() for r in f.readlines()] - merged = merge_old_new(old_codes, codes) - # Need check for duplicates here - with open(f'{groupdir}/proj_codes/{label}.txt','w') as f: - f.write('\n'.join(merged)) - elif overwrite == 2: + old_codes = get_codes(groupdir, None, f'proj_codes/{label}') + codeset = '\n'.join(merge_old_new(old_codes, codes)) + elif overwrite >= 2: print(f'Overwriting with {len(codes)} in existing proj_codes/{label} file') - with open(f'{groupdir}/proj_codes/{label}.txt','w') as f: - f.write(codeset) - else: - with open(f'{groupdir}/proj_codes/{label}.txt','w') as f: - f.write(codeset) - print(f'Written {len(codes)} to proj_codes/{label}') + if write: + print(f'writing {len(codes)} to proj_codes/{label}') + set_codes(groupdir, None, f'proj_codes/{label}', codeset) else: print('No codes identified, no files written') @@ -131,7 +164,7 @@ def show_options(args, logger): List output or error directories (one per job id), or list all proj_codes text files.""" # Load the blacklist here - if args.option in ['blacklist', 'virtuals'] or 'variable' in args.option: + if args.option in ['blacklist', 'virtuals','parquet'] or 'variable' in args.option: blackset = get_codes(args.groupID, args.workdir, 'blacklist_codes') if blackset: blackcodes = {b.split(',')[0]: b.split(',')[1] for b in blackset} @@ -160,63 +193,69 @@ def show_options(args, logger): print('Current Blacklist:') for b in blackcodes.keys(): print(f'{b} - {blackcodes[b]}') - elif args.option == 'virtuals': - - print('Finding datasets with a virtual stacking dimension') - codefile = f'{args.groupdir}/proj_codes/main.txt' - if os.path.isfile(codefile): - with open(codefile) as f: - proj_codes = [r.strip() for r in f.readlines()] - else: + elif args.option == 'parquet' or args.option == 'virtuals': + print(f'Finding datasets that match criteria: {args.option}') + proj_codes = get_codes(args.groupdir, None, 'proj_codes/main.txt') + if not proj_codes: logger.error(f'No proj_codes file found for {args.groupID}') - raise FileNotFoundError(file=codefile) - non_virtual = 0 - for p in proj_codes: + raise FileNotFoundError(file='proj_codes/main.txt') + non_selected = 0 + supplemental = None + for x, p in enumerate(proj_codes): virtual = False + parquet = False if p in blackcodes: continue - detailfile = f'{args.workdir}/in_progress/{args.groupID}/{p}/detail-cfg.json' - if os.path.isfile(detailfile): - with open(detailfile) as f: - details = json.load(f) + + details = get_proj_file( + f'{args.workdir}/in_progress/{args.groupID}/{p}', + 'detail-cfg.json') + if details: if 'virtual_concat' in details: if details['virtual_concat']: virtual = True + if 'type' in details: + if details['type'] == 'parq': + parquet = True + supplemental = details['kerchunk_data'] else: - print(f'Missing detailfile for {p}') + print(f'Missing or obstructed detailfile for {p}') - if virtual: - print(f'Project code: {p} - Virtual present') + if virtual and args.option == 'virtual': + print(f'{x}: {p} 
- Virtual present') + elif parquet and args.option == 'parquet': + print(f'{x}: {p} - Parquet advised ({supplemental})') else: - non_virtual += 1 - print(f'Non-virtual datasets : {non_virtual}') + non_selected += 1 + if args.option == 'parquet': + print(f'JSON datasets : {non_selected}') + else: + print(f'Non-virtual datasets : {non_selected}') print(f'Total datasets : {len(proj_codes) - len(blackcodes)}') - elif 'variable' in args.option: + elif 'variable' in args.option: raise NotImplementedError - var = args.option.split(':')[-1] - print('Finding datasets with a virtual stacking dimension') - codefile = f'{args.groupdir}/proj_codes/main.txt' - if os.path.isfile(codefile): - with open(codefile) as f: - proj_codes = [r.strip() for r in f.readlines()] - else: - logger.error(f'No proj_codes file found for {args.groupID}') - raise FileNotFoundError(file=codefile) - else: print(f'{args.option} not accepted - use "jobids" or "labels"') -def cleanup(cleantype: str, groupdir: str, logger): +def cleanup(args, logger): + cleantype = args.cleanup + groupdir = args.groupdir """Remove older versions of project code files, error or output logs. Clear directories.""" + cleaned = False if cleantype == 'labels' or cleantype == 'all': projset = glob.glob(f'{groupdir}/proj_codes/*') for p in projset: if 'main' not in p: os.system(f'rm {p}') + cleaned = True if cleantype == 'errors' or cleantype == 'all': os.system(f'rm -rf {groupdir}/errs/*') + cleaned = True if cleantype == 'outputs' or cleantype == 'all': os.system(f'rm -rf {groupdir}/outs/*') + cleaned = True + if not cleaned: + logger.info(f"Cleaning skipped - '{args.cleanup}' is not a known option") def seek_unknown(proj_dir): phase = None @@ -235,6 +274,21 @@ def seek_unknown(proj_dir): log_status(phase, proj_dir, status, FalseLogger()) +def force_datetime_decode(datestamp): + parts = datestamp.split(' ') + if '/' in parts[0]: + date = parts[0] + time = parts[1] + else: + date = parts[1] + time = parts[0] + month, day, year = date.split('/') + if len(str(year)) == 2: + year = '20' + str(year) + hr, mt = time.split(':') + dt = datetime(int(year), int(month), int(day), hour=int(hr), minute=int(mt)) + return dt + def progress_check(args, logger): """Give a general overview of progress within the pipeline - How many datasets currently at each stage of the pipeline @@ -245,6 +299,11 @@ def progress_check(args, logger): blacklist = get_codes(args.groupID, args.workdir, 'blacklist_codes') proj_codes = get_codes(args.groupID, args.workdir, f'proj_codes/{args.repeat_id}') + if args.write: + print('Write permission granted:') + print(' - Will seek status of unknown project codes') + print(' - Will update status with "JobCancelled" for >24hr pending jobs') + groupdir = f'{args.workdir}/groups/{args.groupID}' done_set = {} @@ -254,87 +313,104 @@ def progress_check(args, logger): for b in blacklist: entry = b.replace(' ','').split(',') if entry[1] in extras['blacklist']: - extras['blacklist'][entry[1]] += 1 + extras['blacklist'][entry[1]].append(0) else: - extras['blacklist'][entry[1]] = 1 + extras['blacklist'][entry[1]] = [0] done_set[entry[0]] = True phases = {'init':{}, 'scan': {}, 'compute': {}, 'validate': {}} savecodes = [] longest_err = 0 - for p in proj_codes: - if p not in done_set: - proj_dir = f'{args.workdir}/in_progress/{args.groupID}/{p}' - status_log = f'{proj_dir}/status_log.csv' - if os.path.isfile(status_log): - with open(status_log) as f: - current = f.readlines()[-1].strip() - else: - seek_unknown(proj_dir) - if 'unknown' in extras: - 
extras['unknown']['no data'] += 1 + for idx, p in enumerate(proj_codes): + try: + if p not in done_set: + proj_dir = f'{args.workdir}/in_progress/{args.groupID}/{p}' + status_log = f'{proj_dir}/status_log.csv' + if os.path.isfile(status_log): + with open(status_log) as f: + current = f.readlines()[-1].strip() else: - extras['unknown'] = {'no data': 1} - continue - entry = current.split(',') - if len(entry[1]) > longest_err: - longest_err = len(entry[1]) - - match_phase = (bool(args.phase) and args.phase == entry[0]) - match_error = (bool(args.error) and args.error == entry[1]) - - if bool(args.phase) != (args.phase == entry[0]): - total_match = False - elif bool(args.error) != (args.error == entry[1]): - total_match = False - else: - total_match = match_phase or match_error + seek_unknown(proj_dir) + if 'unknown' in extras: + extras['unknown']['no data'].append(idx) + else: + extras['unknown'] = {'no data':[idx]} + continue + entry = current.split(',') + if len(entry[1]) > longest_err: + longest_err = len(entry[1]) + + if entry[1] == 'pending' and args.write: + timediff = (datetime.now() - force_datetime_decode(entry[2])).total_seconds() + if timediff > 86400: # 1 Day - fixed for now + entry[1] = 'JobCancelled' + log_status(entry[0], proj_dir, entry[1], FalseLogger()) + + match_phase = (bool(args.phase) and args.phase == entry[0]) + match_error = (bool(args.error) and args.error == entry[1]) + + if bool(args.phase) != (args.phase == entry[0]): + total_match = False + elif bool(args.error) != (args.error == entry[1] or args.error == entry[1].split(' ')[0]): + total_match = False + else: + total_match = match_phase or match_error - if total_match: - if args.examine: - examine_log(args.workdir, p, entry[0], groupID=args.groupID, repeat_id=args.repeat_id, error=entry[1]) - if args.new_id or args.blacklist: - savecodes.append(p) + if total_match: + if args.examine: + examine_log(args.workdir, p, entry[0], groupID=args.groupID, repeat_id=args.repeat_id, error=entry[1]) + if args.new_id or args.blacklist: + savecodes.append(p) - if entry[0] == 'complete': - complete += 1 - else: - if entry[1] in phases[entry[0]]: - phases[entry[0]][entry[1]] += 1 + if entry[0] == 'complete': + complete += 1 else: - phases[entry[0]][entry[1]] = 1 - + if entry[1] in phases[entry[0]]: + phases[entry[0]][entry[1]].append(idx) + else: + phases[entry[0]][entry[1]] = [idx] + except KeyboardInterrupt as err: + raise err + except: + examine_log(args.workdir, p, entry[0], groupID=args.groupID, repeat_id=args.repeat_id, error=entry[1]) + print(f'Issue with analysis of error log: {p}') num_codes = len(proj_codes) print() print(f'Group: {args.groupID}') print(f' Total Codes: {num_codes}') - def summary_dict(pdict, num_codes, status_len=5): + def summary_dict(pdict, num_codes, status_len=5, numbers=0): """Display summary information for a dictionary structure of the expected format.""" for entry in pdict.keys(): pcount = len(list(pdict[entry].keys())) - num_types = sum([pdict[entry][pop] for pop in pdict[entry].keys()]) + num_types = sum([len(pdict[entry][pop]) for pop in pdict[entry].keys()]) if pcount > 0: print() - fmentry = format_str(entry,10) - fmnum_types = format_str(num_types,5) - fmcalc = format_str(f'{num_types*100/num_codes:.1f}',4) + fmentry = format_str(entry,10, concat=False) + fmnum_types = format_str(num_types,5, concat=False) + fmcalc = format_str(f'{num_types*100/num_codes:.1f}',4, concat=False) print(f' {fmentry}: {fmnum_types} [{fmcalc}%] (Variety: {int(pcount)})') - for err in pdict[entry]: - num_errs = 
pdict[entry][err] - print(f' - {format_str(err, status_len+1)}: {num_errs}') - + # Convert from key : len to key : [list] + errkeys = reversed(sorted(pdict[entry], key=lambda x:len(pdict[entry][x]))) + for err in errkeys: + num_errs = len(pdict[entry][err]) + if num_errs < numbers: + print(f' - {format_str(err, status_len+1, concat=True)}: {num_errs} (IDs = {list(pdict[entry][err])})') + else: + print(f' - {format_str(err, status_len+1, concat=True)}: {num_errs}') if not args.write: print() print('Pipeline Current:') - summary_dict(phases, num_codes, status_len=longest_err) + if not args.long and longest_err > 30: + longest_err = 30 + summary_dict(phases, num_codes, status_len=longest_err, numbers=int(args.numbers)) print() print('Pipeline Complete:') print() complete_percent = format_str(f'{complete*100/num_codes:.1f}',4) print(f' complete : {format_str(complete,5)} [{complete_percent}%]') - summary_dict(extras, num_codes, status_len=longest_err) + summary_dict(extras, num_codes, status_len=longest_err, numbers=0) print() if args.new_id: @@ -350,58 +426,6 @@ def summary_dict(pdict, num_codes, status_len=5): add_to_blacklist(savecodes, args.groupdir, args.reason, logger) else: print('Skipped blacklisting codes - Write flag not present') - - -def status_check(args, logger): - """Check general progress of pipeline for a specific group. - - Lists progress up to the provided phase, options to save all project codes stuck at a specific phase to a repeat_id for later use.""" - - status_check(args, logger) - - """ - if args.phase not in phases: - logger.error(f'Phase not accepted here - {args.phase}') - return None - else: - print(f'Discovering dataset progress within group {args.groupID}') - ignore_pcodes = [] - blacklist = get_blacklist(args.groupdir) - if blacklist: - ignore_pcodes = [b.split(',')[0] for b in blacklist] - print(f'blacklist: {len(blacklist)} datasets') - - for index, phase in enumerate(phases[:-1]): # Ignore complete check as this happens as a byproduct - redo_pcodes, completes = find_codes(phase, args.workdir, args.groupID, checks[index], ignore=ignore_pcodes) - print(f'{phase}: {len(redo_pcodes)} datasets') - if completes: - print(f'complete: {len(completes)} datasets') - if phase == args.phase: - break - ignore_pcodes += redo_pcodes - if args.phase == 'complete': - redo_pcodes = completes - - - # Write pcodes - if not args.repeat_id: - id = 1 - new_projcode_file = f'{args.workdir}/groups/{args.groupID}/proj_codes/{args.phase}_{id}.txt' - while os.path.isfile(new_projcode_file): - id += 1 - new_projcode_file = f'{args.workdir}/groups/{args.groupID}/proj_codes/{args.phase}_{id}.txt' - - args.repeat_id = f'{args.phase}_{id}' - - new_projcode_file = f'{args.workdir}/groups/{args.groupID}/proj_codes/{args.repeat_id}.txt' - - if args.write: - with open(new_projcode_file,'w') as f: - f.write('\n'.join(redo_pcodes)) - - # Written new pcodes - print(f'Written {len(redo_pcodes)} pcodes, repeat label: {args.repeat_id}') - """ def add_to_blacklist(savedcodes, groupdir, reason, logger): """Add a set of codes to the blacklist for a given reason""" @@ -411,13 +435,12 @@ def add_to_blacklist(savedcodes, groupdir, reason, logger): logger.debug(f'Starting blacklist concatenation') merged = '' - with open(blackfile) as f: - blackcodes = [r.strip().split(',') for r in f.readlines()] + blackcodes = get_codes(groupdir, None, 'blacklist_codes') + merged = merge_old_new(blackcodes, savedcodes, index_old=0, reason=reason) blacklist = '\n'.join([f'{m}' for m in merged]) - with open(blackfile,'w') as 
f: - f.write(blacklist) + set_codes(groupdir, None, 'blacklist_codes', blacklist) print(f'Added {len(merged) - len(blackcodes)} new codes to blacklist') def upgrade_version(args, logger): @@ -447,18 +470,13 @@ def upgrade_version(args, logger): print(f'Upgrading {code} to {args.upgrade}') logger.debug(f'Updating detail-cfg for {code}') - detailfile = f'{proj_dir}/detail-cfg.json' - try: - with open(detailfile) as f: - details = json.load(f) + details = get_proj_file(proj_dir, 'detail-cfg.json') + if details: details['version_no'] = args.upgrade if args.reason: details['version_reason'] = args.reason if args.write: - with open(detailfile,'w') as f: - f.write(json.dumps(details)) - except: - print("Skipped detail step") + set_proj_file(proj_dir, 'detail-cfg.json', details, logger) logger.debug(f'Locating kerchunk file for {code}') in_filename = False @@ -515,19 +533,15 @@ def upgrade_version(args, logger): def analyse_data(g, workdir): """Show some statistics of netcdf and kerchunk data amounts for this particular group""" - ncf, ker, kus = 0, 0, 0 + ncf, ker, kus, nus = 0, 0, 0, 0 complete, scanned = 0, 0 projset = get_codes(g, workdir, 'proj_codes/main') # Add individual error log checking PPC here. for p in projset: - details = None - try: - with open(f'{workdir}/in_progress/{g}/{p}/detail-cfg.json') as f: - details = json.load(f) - except: - pass + proj_dir = f'{workdir}/in_progress/{g}/{p}' + details = get_proj_file(proj_dir, 'detail-cfg.json') if details: scanned += 1 if 'netcdf_data' in details: @@ -535,8 +549,9 @@ def analyse_data(g, workdir): ker += mem_to_val(details['kerchunk_data']) if os.path.isfile(f'{workdir}/in_progress/{g}/{p}/kerchunk-1a.json.complete'): kus += mem_to_val(details['kerchunk_data']) + nus += mem_to_val(details['netcdf_data']) complete += 1 - return ncf, ker, kus, scanned, complete + return ncf, ker, kus, nus, scanned, complete def summary_data(args, logger): """Display summary info in terms of data representation""" @@ -547,22 +562,24 @@ def summary_data(args, logger): else: groups = [args.groupID] - Tncf, Tker, Tkus, Tscan, Tcomp = 0, 0, 0, 0, 0 + Tncf, Tker, Tkus, Tnus, Tscan, Tcomp = 0, 0, 0, 0, 0, 0 for g in groups: print() print(g) - ncf, ker, kus, scanned, complete = analyse_data(g, args.workdir) + ncf, ker, kus, nus, scanned, complete = analyse_data(g, args.workdir) print(f' Datasets : {len(get_codes(g, args.workdir, "proj_codes/main"))}') print(f' - Unavailable : {len(get_codes(g, args.workdir, "blacklist_codes"))}') print(f' Data:') print(f' - NetCDF : {format_float(ncf, FalseLogger())}') print(f' - Kerchunk Estm : {format_float(ker, FalseLogger())} ({scanned})') + print(f' - NetCDF Actual : {format_float(nus, FalseLogger())}') print(f' - Kerchunk Actual: {format_float(kus, FalseLogger())} ({complete})') print() Tncf += ncf Tker += ker Tkus += kus + Tnus += nus Tscan += scanned Tcomp += complete @@ -570,6 +587,7 @@ def summary_data(args, logger): print(f'Total Across {len(groups)} groups') print(f' NetCDF: {format_float(Tncf, FalseLogger())}') print(f' Kerchunk Estm: {format_float(Tker, FalseLogger())} ({Tscan})') + print(f'NetCDF Actual : {format_float(Tnus, FalseLogger())}') print(f'Kerchunk Actual: {format_float(Tkus, FalseLogger())} ({Tcomp})') operations = { @@ -578,6 +596,7 @@ def summary_data(args, logger): 'upgrade': upgrade_version, 'summarise': summary_data, 'display': show_options, + 'cleanup': cleanup, } def assess_main(args): @@ -590,15 +609,9 @@ def assess_main(args): if ',' in args.error: args.error = args.error.split(',') - """ 
Removed for now - if args.cleanup: - cleanup(args.cleanup, args.groupdir, logger) - return None - """ - if args.groupID == 'A': groups = [] - for d in glob.glob(f'{args.workdir}/groups/'): + for d in glob.glob(f'{args.workdir}/groups/*'): if os.path.isdir(d): groups.append(d.split('/')[-1]) elif ',' in args.groupID: @@ -627,6 +640,7 @@ def assess_main(args): parser.add_argument('-s','--show-opts', dest='option', help='Show options for jobids, labels') parser.add_argument('-c','--clean-up', dest='cleanup', default=None, help='Clean up group directory of errors/outputs/labels') parser.add_argument('-U','--upgrade', dest='upgrade', default=None, help='Upgrade to new version') + parser.add_argument('-l','--long', dest='long', action='store_true', help='Show long error message (no concatenation past 20 chars.)') # Note this will be replaced with upgrader tool at some point # Select subgroups and save new repeat groups @@ -634,6 +648,7 @@ def assess_main(args): parser.add_argument('-p','--phase', dest='phase', default=None, help='Pipeline phase to inspect') parser.add_argument('-r','--repeat_id', dest='repeat_id', default='main', help='Inspect an existing ID for errors') parser.add_argument('-n','--new_id', dest='new_id', default=None, help='Create a new repeat ID, specify selection of codes by phase, error etc.') + parser.add_argument('-N','--numbers', dest='numbers', default=0, help='Show project code numbers for quicker reruns across different errors.') # Error inspection parser.add_argument('-e','--error', dest='error', default='', help='Inspect error of a specific type') From 85c78590c0da1a26a1184d12263e73a56e55a9a8 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Tue, 26 Mar 2024 09:45:08 +0000 Subject: [PATCH 03/20] Added dependent modules for docs testing --- pipeline/utils.py | 118 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 108 insertions(+), 10 deletions(-) diff --git a/pipeline/utils.py b/pipeline/utils.py index 33a8ba3..5eddde8 100644 --- a/pipeline/utils.py +++ b/pipeline/utils.py @@ -3,27 +3,74 @@ __copyright__ = "Copyright 2023 United Kingdom Research and Innovation" import os +import xarray as xr +import json +import fsspec -from pipeline.errors import MissingVariableError +from pipeline.errors import MissingVariableError, MissingKerchunkError, ChunkDataError + +def open_kerchunk(kfile: str, logger, isparq=False, remote_protocol='file'): + """Open kerchunk file from JSON/parquet formats""" + if isparq: + logger.debug('Opening Kerchunk Parquet store') + from fsspec.implementations.reference import ReferenceFileSystem + fs = ReferenceFileSystem( + kfile, + remote_protocol='file', + target_protocol="file", + lazy=True) + return xr.open_dataset( + fs.get_mapper(), + engine="zarr", + backend_kwargs={"consolidated": False, "decode_times": False} + ) + else: + logger.debug('Opening Kerchunk JSON file') + try: + mapper = fsspec.get_mapper('reference://',fo=kfile, target_options={"compression":None}, remote_protocol=remote_protocol) + except json.JSONDecodeError as err: + logger.error(f"Kerchunk file {kfile} appears to be empty") + raise MissingKerchunkError + # Need a safe repeat here + ds = None + attempts = 0 + while attempts < 3 and not ds: + attempts += 1 + try: + ds = xr.open_zarr(mapper, consolidated=False, decode_times=True) + except OverflowError: + ds = None + except Exception as err: + raise err #MissingKerchunkError(message=f'Failed to open kerchunk file {kfile}') + if not ds: + raise ChunkDataError + logger.debug('Successfully opened Kerchunk with virtual 
xarray ds') + return ds def get_attribute(env: str, args, var: str): """Assemble environment variable or take from passed argument. Finds value of variable from Environment or ParseArgs object, or reports failure """ - if getattr(args, var): - return getattr(args, var) - elif os.getenv(env): + try: + if getattr(args, var): + return getattr(args, var) + except AttributeError: + pass + if os.getenv(env): return os.getenv(env) else: print(var) raise MissingVariableError(type=var) - -def format_str(string: str, length: int): + +def format_str(string: str, length: int, concat=False): """Simple function to format a string to a correct length""" string = str(string) - while len(string) < length: - string += ' ' + if len(string) >= length and concat: + string = string[:length-3] + '...' + else: + while len(string) < length: + string += ' ' return string[:length] class BypassSwitch: @@ -71,5 +118,56 @@ def mem_to_val(value): def get_codes(group, workdir, filename): """Returns a list of the project codes given a filename (repeat id)""" - with open(f'{workdir}/groups/{group}/{filename}.txt') as f: - return [r.strip() for r in f.readlines()] \ No newline at end of file + if workdir: + codefile = f'{workdir}/groups/{group}/{filename}.txt' + else: + codefile = f'{group}/{filename}.txt' + if os.path.isfile(codefile): + with open(codefile) as f: + return [r.strip() for r in f.readlines()] + else: + return [] + +def set_codes(group, workdir, filename, contents, overwrite=0): + codefile = f'{group}/{filename}.txt' + if workdir: + codefile = f'{workdir}/groups/{group}/{filename}.txt' + + ow = 'w' + if overwrite == 1: + ow = 'w+' + + with open(codefile, ow) as f: + f.write(contents) + +def get_proj_file(proj_dir, proj_file): + projfile = f'{proj_dir}/{proj_file}' + if os.path.isfile(projfile): + try: + with open(projfile) as f: + contents = json.load(f) + return contents + except: + with open(projfile) as f: + print(f.readlines()) + return None + else: + return None + +def set_proj_file(proj_dir, proj_file, contents, logger): + projfile = f'{proj_dir}/{proj_file}' + if not os.path.isfile(projfile): + os.system(f'touch {projfile}') + try: + with open(projfile,'w') as f: + f.write(json.dumps(contents)) + logger.debug(f'{proj_file} updated') + except Exception as err: + logger.error(f'{proj_file} unable to update - {err}') + +def get_proj_dir(proj_code, workdir, groupID): + if groupID: + return f'{workdir}/in_progress/{groupID}/{proj_code}' + else: + return f'{workdir}/in_progress/{proj_code}' + From ad22b305e77212a2f2a9cdd0e6fcafe485b158d9 Mon Sep 17 00:00:00 2001 From: dwest77a Date: Tue, 26 Mar 2024 11:06:01 +0000 Subject: [PATCH 04/20] Updated source docs --- docs/source/assess-overview.rst | 1 + docs/source/conf.py | 2 +- docs/source/index.rst | 24 ++++++++++++++++++------ docs/source/pipeline-overview.rst | 2 +- docs/source/pipeline-source.rst | 2 +- 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/docs/source/assess-overview.rst b/docs/source/assess-overview.rst index fb1d786..c2c1d0f 100644 --- a/docs/source/assess-overview.rst +++ b/docs/source/assess-overview.rst @@ -8,6 +8,7 @@ The assessor script ```assess.py``` is an all-purpose pipeline checking tool whi An example command to run the assessor tool can be found below: :: + python assess.py Where the operation can be one of the below options: diff --git a/docs/source/conf.py b/docs/source/conf.py index c2689a1..c89df2d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,7 +10,7 @@ import sys sys.path.insert(0, 
os.path.abspath('../../')) -project = 'Kerchunk Pipeline' +project = 'Padocc Package' copyright = '2024, Daniel Westwood' author = 'Daniel Westwood' release = 'v1.1' diff --git a/docs/source/index.rst b/docs/source/index.rst index 4947bf5..41e521f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,10 +3,22 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to the Kerchunk Pipeline documentation! -=============================================== +Padocc - User Documentation +============================ -**kerchunk-builder** is a Python package for creating sets of Kerchunk files from an archive of NetCDF/HDF/Tiff files. The pipeline makes it easy to create multiple Kerchunk files for different datasets in parallel with validation steps to ensure the outputs are correct. +**padocc** (Pipeline to Aggregate Data for Optimised Cloud Capabilites) is a Python package (package name **kerchunk-builder**) for aggregating data to enable methods of access for cloud-based applications. + +The pipeline makes it easy to generate data-aggregated access patterns in the form of Reference Files or Cloud Formats across different datasets simultaneously with validation steps to ensure the outputs are correct. + +Vast amounts of archival data in a variety of formats can be processed using the package's group mechanics and automatic deployment to a job submission system. + +Currently supported input file formats: + - NetCDF/HDF + - GeoTiff (**coming soon**) + - GRIB (**coming soon**) + - MetOffice (**future**) + +*padocc* is capable of generating both reference files with Kerchunk (JSON or Parquet) and cloud formats like Zarr. The pipeline consists of four central phases, with an additional phase for ingesting/cataloging the produced Kerchunk files. This is not part of the code-base of the pipeline currently but could be added in a future update. @@ -20,7 +32,7 @@ The pipeline consists of four central phases, with an additional phase for inges Introduction Getting Started Example CCI Water Vapour - Pipeline Flags/Options + Padocc Flags/Options Assessor Tool Overview Error Codes @@ -28,9 +40,9 @@ The pipeline consists of four central phases, with an additional phase for inges :maxdepth: 2 :caption: Advanced: - Pipeline Source + Padocc Source Assessor Source - Control Script Source + Control Scripts Source diff --git a/docs/source/pipeline-overview.rst b/docs/source/pipeline-overview.rst index 251df4d..3d1ddc1 100644 --- a/docs/source/pipeline-overview.rst +++ b/docs/source/pipeline-overview.rst @@ -6,7 +6,7 @@ Overview of Pipeline Phases **Init (Initialisation) Phase** -The Kerchunk pipeline takes a CSV (or similar) input file and creates the necessary directories and config files for the pipeline to being running. +The pipeline takes a CSV (or similar) input file and creates the necessary directories and config files for the pipeline to being running. 
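For example (illustrative only - the fields follow the CSV format described in the repository README: project code, pattern/filename, updates, removals), an input file for the init phase might look like: ::

    UKCP_dataset1, /path/to/ukcp/files/*.nc, 0, 0
    UKCP_dataset2, /path/to/other/files/*.nc, 0, 0

which would be supplied via ``python group_run.py init <group> -i <input>.csv``.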
**Scan Phase** diff --git a/docs/source/pipeline-source.rst b/docs/source/pipeline-source.rst index be6094f..de577b1 100644 --- a/docs/source/pipeline-source.rst +++ b/docs/source/pipeline-source.rst @@ -1,4 +1,4 @@ -Pipeline Source Code +Padocc Source Code ==================== ===================== From 34e09e85fbc0b50d618038f4ee08ddba843ecd29 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Tue, 26 Mar 2024 11:07:24 +0000 Subject: [PATCH 05/20] Updated README to point to documentation --- README.md | 106 +++--------------------------------------------------- 1 file changed, 5 insertions(+), 101 deletions(-) diff --git a/README.md b/README.md index 1213372..fdcd4fc 100644 --- a/README.md +++ b/README.md @@ -1,108 +1,12 @@ # kerchunk-builder -A repository for building a kerchunk infrastructure using existing tools, and a set of showcase notebooks to use on example data in this repository. Now a repository under cedadev group! -Example Notebooks: -https://mybinder.org/v2/gh/cedadev/kerchunk-builder.git/main?filepath=showcase/notebooks +The Kerchunk Pipeline (Soon to be renamed) is a Data Aggregation pipeline for creating Kerchunk files to represent various datasets in different original formats. +Currently the Pipeline supports writing JSON/Parquet Kerchunk files for input NetCDF/HDF files. Further developments will allow GeoTiff, GRIB and possibly MetOffice (.pp) files to be represented, as well as using the Pangeo [Rechunker](https://rechunker.readthedocs.io/en/latest/) tool to create Zarr stores for Kerchunk-incompatible datasets. -# Pipeline Phases +[Example Notebooks at this link](https://mybinder.org/v2/gh/cedadev/kerchunk-builder.git/main?filepath=showcase/notebooks) -All pipeline phases are now run using master scripts `single_run.py` or `group_run.py` +[Documentation hosted at this link](https://cedadev.github.io/kerchunk-builder/) -## 0 Activating Environment Settings - -`source build_venv/bin/activate` - -Python virtual environment setup - -`. templates/.sh` - -Sets all environment variables, if a shell script is already present with the correct name. Environment variables to set are: - - WORKDIR: (Required) - Central workspace for saving data - - GROUPDIR: (Required for parallel) - Workspace for a specific group - - SRCDIR: (Required for parallel) - Kerchunk pipeline repo path ending in `/kerchunk-builder` - - KVENV: (Required for parallel) - Path to virtual environment. -All of the above can be passed as flags to each script, or set as environment variables before processing. - -## 1. Running the Pipeline - Examples - -### 1.1 Single running of an isolated dataset -`python single_run.py scan a11x34 -vfbd` - -The above runs the scan process for project code `a11x34` with verbose level 1, forced running (overwrites existing files), bypass errors with `-b` and dry-running with `-d`. Note that running with `-f` and `-d` means that sections will not be skipped if files already exist, but no new files will be generated. - -### 1.2 Single running for a dataset within a group -`python single_run.py scan 0 -vfbd -G CMIP6_exampleset_1 -r scan_2` - -The above has the same features as before, except now we are using project id `0` in place of a project code, with a group ID (`-G`) supplied as well as a repeat ID (`-r`) from which to identify the correct project code from a group. This is an example of what each parallel job will execute, so using this format is solely for test purposes. 
- -### 1.3 Group running of multiple datasets -`python group_run.py scan CMIP6_exampleset_1 -vfbd -r scan_2` - -The above is the full parallelised job execution command which would activate all jobs with the `single_run.py` script as detailed in section 1.2. This command creates a sbatch file and separate command to start the parallel jobs, which will include all datasets within the `scan_2` subgroup of the `CMIP6_exampleset_1` group. Subgroups are created with the `identify_reruns.py` script. - -### 1.4 Full Worked Example -Using the example documents in this repository, we can run an example group containing just two datasets. Any number of datasets would also follow this method, two is not a unique number other than being the smallest so to minimise duplication. - -#### 1.4.1 Init -The first step is to initialise the group from the example csv given. Here I am giving the group the identifier `UKCP_test1` as the second argument after the phase `init` which we are peforming. With `-i` we supply an input csv (This can also be a text file for some cases where the project code can be generated). Finally `-v` means we get to see general information as the program is running. -`python group_run.py init UKCP_test1 -i examples/UKCP_test1.csv -v` - -#### 1.4.2 Scan -Scanning will give an indication of how long each file will take to produce and some other characteristics which will come into play in later phases. -`python group_run.py scan UKCP_test1 -v` - -If running in `dryrun` mode, this will generate an sbatch submission command like: -`sbatch --array=0-2 /gws/nopw/j04/cmip6_prep_vol1/kerchunk-pipeline/groups/UKCP_test1/sbatch/scan.sbatch` - -Which can be copied into the terminal and executed. Otherwise the jobs will be automatically submitted. - -#### 1.4.3 Compute -`python group_run.py compute UKCP_test1 -vv` - -This command differs only with the level of verboseness, with this many 'v's we will see the debug information as well as general information. Again this will produce an sbatch command to be copied to the terminal if in dryrun mode. - -#### 1.4.4 Validate -`python group_run.py validate UKCP_test1 -vv` - -This final step will submit all datasets for validation, which includes copying the final output file to the `/complete` directory within the workdir set as an environment variable. - - -## 2. Pipeline Phases in detail - -### 2.1 Init -Initialise and configure for running the pipeline for any number of datasets in parallel. 
-If using the pipeline with a group of datasets, an input file is required (`-i` option) which must be one of: - - A text file containing the wildcard paths describing all files within each dataset (CMIP6) - - A properly formatted CSV with fields for each entry corresponding to the headers: - - Project code: Unique identifier for this dataset, commonly taken from naming conventions in the path - - Pattern/Filename - - Updates: Boolean 1/0 if updates file is present - - Removals: Boolean 1/0 if removals file is present - -### 2.2 Scan -Run kerchunk-scan tool (or similar) to build a test kerchunk file and determine parameters: - - chunks per netcdf file (Nc) - - average chunk size (Tc) - - total expected kerchunk size (Tk) - -### 2.3 Compute -Create parquet store for a specified dataset, using method depending on total expected kerchunk size (Tk) - -#### 2.3.1 Create Kerchunk JSON dataset - -#### 2.3.2 Large Chunkset Tk value - Parallel (Batch) processing - - Batch process to create parts - batch_process/process_wrapper.py - - Combine parts using copier script - combine_refs.py - - Correct metadata (shape, parameters) - correct_meta.py - - Run time correction script if necessary - correct_time.py - -#### 2.3.3 Small Chunkset Tk value - Serial processing -Run create parquet script - create_parq.py -Not currently supported - -### 3. Validate -Run a series of tests on parquet store usage: - - Ensure small plot success with no errors - - Ensure large plot (dask gateway) success with no errors or killed job. +![Kerchunk Pipeline](docs/source/_images/pipeline.png) \ No newline at end of file From 11e56f2ae103883b5d408ce1312171ba8b25ce67 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Tue, 26 Mar 2024 15:34:29 +0000 Subject: [PATCH 06/20] Overhauled docstrings, tidied and optimised --- single_run.py | 340 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 235 insertions(+), 105 deletions(-) diff --git a/single_run.py b/single_run.py index 53c7ec3..f238dc6 100644 --- a/single_run.py +++ b/single_run.py @@ -10,25 +10,59 @@ import logging from datetime import datetime import traceback +import re +# Pipeline Modules from pipeline.logs import init_logger, reset_file_handler, log_status -from pipeline.utils import get_attribute, BypassSwitch, get_codes +from pipeline.utils import get_attribute, BypassSwitch, get_codes, get_proj_file from pipeline.errors import ProjectCodeError, MissingVariableError, BlacklistProjectCode -def run_init(args, logger, fh=None, **kwargs): - """Start initialisation for single dataset""" +def run_init(args, logger, fh=None, **kwargs) -> None: + """ + Start initialisation for single dataset + + :param args: (obj) Set of command line arguments supplied by argparse. + + :param logger: (obj) Logging object for info/debug/error messages. + + :param fh: (str) Path to file for logger I/O when defining new logger. + + :returns: None + """ from pipeline.init import init_config logger.info('Starting init process') - return init_config(args, fh=fh, **kwargs) + init_config(args, fh=fh, **kwargs) + +def run_scan(args, logger, fh=None,**kwargs) -> None: + """ + Start scanning process for individual dataset -def run_scan(args, logger, fh=None,**kwargs): - """Start scanning process for individual dataset""" + :param args: (obj) Set of command line arguments supplied by argparse. + + :param logger: (obj) Logging object for info/debug/error messages. + + :param fh: (str) Path to file for logger I/O when defining new logger. 
+ + :returns: None + """ from pipeline.scan import scan_config logger.info('Starting scan process') - return scan_config(args,fh=fh, **kwargs) + scan_config(args,fh=fh, **kwargs) + +def run_compute(args, logger, fh=None, logid=None, **kwargs) -> None: + """ + Setup computation parameters for individual dataset -def run_compute(args, logger, fh=None, logid=None, **kwargs): - """Setup computation parameters for individual dataset""" + :params args: (obj) Set of command line arguments supplied by argparse. + + :params logger: (obj) Logging object for info/debug/error messages. + + :params fh: (str) Path to file for logger I/O when defining new logger. + + :params logid: (str) Passed to Indexer for specifying a logger component. + + :returns: None + """ from pipeline.compute.serial_process import Indexer logger.info(f'Starting computation step for {args.proj_code}') @@ -64,21 +98,51 @@ def run_compute(args, logger, fh=None, logid=None, **kwargs): if complete and not escape: - return Indexer(args.proj_code, cfg_file=cfg_file, detail_file=detail_file, + t1 = datetime.now() + ds = Indexer(args.proj_code, cfg_file=cfg_file, detail_file=detail_file, workdir=args.workdir, issave_meta=True, thorough=args.quality, forceful=args.forceful, verb=args.verbose, mode=args.mode, version_no=version_no, concat_msg=concat_msg, bypass=args.bypass, groupID=args.groupID, - dryrun=args.dryrun, fh=fh, logid=logid).create_refs() + dryrun=args.dryrun, fh=fh, logid=logid) + ds.create_refs() + + compute_time = (datetime.now()-t1).total_seconds() + + detailfile = f'{args.proj_dir}/detail-cfg.json' + with open(detailfile) as f: + detail = json.load(f) + if 'timings' not in detail: + detail['timings'] = {} + detail['timings']['convert_actual'] = ds.convert_time + detail['timings']['concat_actual'] = ds.concat_time + detail['timings']['compute_actual'] = compute_time + with open(detailfile,'w') as f: + f.write(json.dumps(detail)) + else: logger.error('Output file already exists and there is no plan to overwrite') return None -def run_validation(args, logger, fh=None, **kwargs): - """Start validation of single dataset""" +def run_validation(args, logger, fh=None, **kwargs) -> None: + """ + Start validation of single dataset. + + :param args: (obj) Set of command line arguments supplied by argparse. + + :param logger: (obj) Logging object for info/debug/error messages. + + :param fh: (str) Path to file for logger I/O when defining new logger. + + :returns: None + + """ from pipeline.validate import validate_dataset logger.info('Starting validation process') - return validate_dataset(args, fh=fh, **kwargs) + validate_dataset(args, fh=fh, **kwargs) + + # Note: Validation proved to be unpredictable for timings - not suitable for job allocation. +# Driver functions map to command line input of 'phase' drivers = { 'init':run_init, 'scan':run_scan, @@ -86,8 +150,26 @@ def run_validation(args, logger, fh=None, **kwargs): 'validate': run_validation } -def get_proj_code(workdir: str, group: str, pid, repeat_id, subset=0, id=0): - """Get the correct code given a slurm id from a group of project codes""" +def get_proj_code(workdir: str, group: str, pid, repeat_id, subset=0, id=0) -> str: + """ + Get the correct code given a slurm id from a group of project codes + + :param workdir: (str) The current pipeline working directory. + + :param group: (str) The name of the group which this project code belongs to. + + :param pid: (str) The project code for which to get the index. 
+ + :param repeat_id: (str) The subset within the group (default is main) + + :param subset: (int) The size of the subset within this repeat group. + + :param id: (int) The specific index of this subset within a group. + i.e subset size of 100, total codes is 1000 so 10 codes per subset. + an id value of 2 would mean the third group of 10 codes. + + :returns: The project code (DOI) in string format not index format. + """ try: proj_codes = get_codes(group, workdir, f'proj_codes/{repeat_id}') proj_code = proj_codes[int(id)*subset + pid] @@ -95,21 +177,120 @@ def get_proj_code(workdir: str, group: str, pid, repeat_id, subset=0, id=0): raise ProjectCodeError return proj_code -def blacklisted(proj_code: str, groupdir: str, logger): - blackfile = f'{groupdir}/blacklist_codes.txt' - if os.path.isfile(blackfile): - with open(blackfile) as f: - blackcodes = [r.strip().split(',')[0] for r in f.readlines()] - for code in blackcodes: - if proj_code in code: - return True - return False +def blacklisted(proj_code: str, groupdir: str, logger) -> bool: + """ + Determine if the current project code is blacklisted + + :param groupdir: (str) The path to a group directory within the pipeline + + :param proj_code: (str) The project code in string format (DOI) + + :param logger: (obj) Logging object for info/debug/error messages. + + :returns: True if the project code is in the blacklist, false otherwise. + """ + blackcodes = get_codes(groupdir, None, 'blacklist_codes') + if blackcodes: + return bool(re.match(f'.*{proj_code}.*',''.join(map(str,blackcodes)))) else: logger.debug('No blacklist file preset for this group') return False -def main(args): - """Main function for single run processing""" +def assemble_single_process(args, logger, jobid='', fh=None) -> None: + """ + Process a single task and assemble required parameters. This task may sit within a subset, + repeat id or larger group, but everything from here is concerned with the processing of + a single dataset (task). + + :param args: (obj) Set of command line arguments supplied by argparse. + + :param logger: (obj) Logging object for info/debug/error messages. + + :param jobid: (str) From SLURM_ARRAY_JOB_ID + + :param fh: (str) Path to file for logger I/O when defining new logger. + + :returns: None + """ + + if args.groupID: + + # Avoid stray groupdir definition in environment variables + cmd_groupdir = f'{args.workdir}/groups/{args.groupID}' + if cmd_groupdir != args.groupdir: + logger.warning(f'Overriding environment-defined groupdir value with: {cmd_groupdir}') + args.groupdir = cmd_groupdir + + # Assume using an integer (SLURM_ARRAY_TASK_ID) + proj_code = int(args.proj_code) + + if args.binpack: + # Binpacking requires separate system for getting the right project code + raise NotImplementedError + + args.proj_code = get_proj_code(args.workdir, args.groupID, proj_code, args.repeat_id, subset=args.subset, id=id) + args.proj_dir = f'{args.workdir}/in_progress/{args.groupID}/{args.proj_code}' + + # Get rid of this section if necessary + # Made redundant with use of error logging PPC but still needed - job-error suppression required. 
+ if jobid != '': + errs_dir = f'{args.workdir}/groups/{args.groupID}/errs' + if not os.path.isdir(f'{errs_dir}/{jobid}_{args.repeat_id}'): + os.makedirs(f'{errs_dir}/{jobid}_{args.repeat_id}') + + proj_code_file = f'{args.workdir}/groups/{args.groupID}/proj_codes/{args.repeat_id}.txt' + + if not os.path.isfile(f'{errs_dir}/{jobid}_{args.repeat_id}/proj_codes.txt'): + os.system(f'cp {proj_code_file} {errs_dir}/{jobid}_{args.repeat_id}/proj_codes.txt') + + else: + args.proj_dir = f'{args.workdir}/in_progress/{args.proj_code}' + + #if blacklisted(args.proj_code, args.groupdir, logger) and not args.backtrack: + #raise BlacklistProjectCode + + if not args.phase in drivers: + logger.error(f'"{args.phase}" not recognised, please select from {list(drivers.keys())}') + return None + + logger.debug('Pipeline variables (reconfigured):') + logger.debug(f'WORKDIR : {args.workdir}') + logger.debug(f'GROUPDIR: {args.groupdir}') + logger.debug('Using attributes:') + logger.debug(f'proj_code: {args.proj_code}') + logger.debug(f'proj_dir : {args.proj_dir}') + + # Refresh log for this phase + proj_log = f'{args.proj_dir}/phase_logs/{args.phase}.log' + if not os.path.isdir(f'{args.proj_dir}/phase_logs'): + os.makedirs(f'{args.proj_dir}/phase_logs') + if jobid != '': + if os.path.isfile(proj_log): + os.system(f'rm {proj_log}') + if os.path.isfile(fh): + os.system(f'rm {fh}') + if not args.bypass.skip_report: + log_status(args.phase, args.proj_dir, 'pending', logger, jobid=jobid, dryrun=args.dryrun) + + if jobid != '': + logger = reset_file_handler(logger, args.verbose, proj_log) + drivers[args.phase](args, logger, fh=proj_log, logid=id) + logger = reset_file_handler(logger, args.verbose, fh) + else: + drivers[args.phase](args, logger) + passes += 1 + if not args.bypass.skip_report: + log_status(args.phase, args.proj_dir, 'complete', logger, jobid=jobid, dryrun=args.dryrun) + +def main(args) -> None: + """ + Main function for processing a single job. This could be multiple tasks/datasets within + a single job, but everything from here is serialised, i.e run one after another. + + :param args: (obj) Set of command line arguments supplied by argparse. + + :returns: None + """ jobid = '' fh = '' @@ -149,89 +330,37 @@ def main(args): if args.subset > 1: logger.info(f'Starting process for {id+1}/{args.subset}') try: - if args.groupID: - - # Avoid stray groupdir definition in environment variables - cmd_groupdir = f'{args.workdir}/groups/{args.groupID}' - if cmd_groupdir != args.groupdir: - logger.warning(f'Overriding environment-defined groupdir value with: {cmd_groupdir}') - args.groupdir = cmd_groupdir - - proj_code = int(args.proj_code) - - args.proj_code = get_proj_code(args.workdir, args.groupID, proj_code, args.repeat_id, subset=args.subset, id=id) - args.proj_dir = f'{args.workdir}/in_progress/{args.groupID}/{args.proj_code}' - - # Get rid of this section if necessary - redo to put code list elsewhere - if jobid != '': - errs_dir = f'{args.workdir}/groups/{args.groupID}/errs' - if not os.path.isdir(f'{errs_dir}/{jobid}_{args.repeat_id}'): - os.makedirs(f'{errs_dir}/{jobid}_{args.repeat_id}') + assemble_single_process(args, logger, jobid=jobid, fh=fh) + passes += 1 + except Exception as err: + # Capture all errors - any error handled here is a setup error + # Implement allocation override here - no error thrown if using allocation. 
- proj_code_file = f'{args.workdir}/groups/{args.groupID}/proj_codes/{args.repeat_id}.txt' + # Add error traceback + tb = traceback.format_exc() + logger.error(tb) - if not os.path.isfile(f'{errs_dir}/{jobid}_{args.repeat_id}/proj_codes.txt'): - os.system(f'cp {proj_code_file} {errs_dir}/{jobid}_{args.repeat_id}/proj_codes.txt') + # Reset file handler back to main. + if jobid != '': + logger = reset_file_handler(logger, args.verbose, fh) + fails += 1 - else: - args.proj_dir = f'{args.workdir}/in_progress/{args.proj_code}' - - #if blacklisted(args.proj_code, args.groupdir, logger) and not args.backtrack: - #raise BlacklistProjectCode - - if args.phase in drivers: - logger.debug('Pipeline variables (reconfigured):') - logger.debug(f'WORKDIR : {args.workdir}') - logger.debug(f'GROUPDIR: {args.groupdir}') - logger.debug('Using attributes:') - logger.debug(f'proj_code: {args.proj_code}') - logger.debug(f'proj_dir : {args.proj_dir}') - - # Refresh log for this phase - proj_log = f'{args.proj_dir}/phase_logs/{args.phase}.log' - if not os.path.isdir(f'{args.proj_dir}/phase_logs'): - os.makedirs(f'{args.proj_dir}/phase_logs') - if jobid != '': - if os.path.isfile(proj_log): - os.system(f'rm {proj_log}') - if os.path.isfile(fh): - os.system(f'rm {fh}') - if not args.bypass.skip_report: - log_status(args.phase, args.proj_dir, 'pending', logger, jobid=jobid, dryrun=args.dryrun) + # Report/log status + if not args.bypass.skip_report: try: - if jobid != '': - logger = reset_file_handler(logger, args.verbose, proj_log) - drivers[args.phase](args, logger, fh=proj_log, logid=id) - logger = reset_file_handler(logger, args.verbose, fh) - else: - drivers[args.phase](args, logger) - passes += 1 - if not args.bypass.skip_report: - log_status(args.phase, args.proj_dir, 'complete', logger, jobid=jobid, dryrun=args.dryrun) - except Exception as err: - # Add error traceback - tb = traceback.format_exc() - logger.error(tb) - - if jobid != '': - logger = reset_file_handler(logger, args.verbose, fh) - fails += 1 - if not args.bypass.skip_report: - try: - status = err.get_str() - except AttributeError: - status = type(err).__name__ + ' ' + str(err) - - # Messes up the csv if there are commas - status = status.replace(',','-') - log_status(args.phase, args.proj_dir, status, logger, jobid=jobid, dryrun=args.dryrun) - else: - raise err + status = err.get_str() + except AttributeError: + status = type(err).__name__ + ' ' + str(err) + + # Messes up the csv if there are commas + status = status.replace(',','-') + log_status(args.phase, args.proj_dir, status, logger, jobid=jobid, dryrun=args.dryrun) + elif not args.binpack: + # Only raise error if we're not bin packing AND skipping the reporting. + # If reporting is skipped, the error is not displayed directly but fails are recorded at the end. 
+ raise err else: - logger.error(f'"{args.phase}" not recognised, please select from {list(drivers.keys())}') - except Exception as err: - # Capture all errors - any error handled here is a setup error - raise err + pass logger.info('Pipeline phase execution finished') logger.info(f'Success: {passes}, Error: {fails}') return True @@ -248,6 +377,7 @@ def main(args): parser.add_argument('-Q','--quality', dest='quality', action='store_true', help='Quality assured checks - thorough run') parser.add_argument('-b','--bypass-errs', dest='bypass', default='DBSCMR', help=BypassSwitch().help()) parser.add_argument('-B','--backtrack', dest='backtrack', action='store_true', help='Backtrack to previous position, remove files that would be created in this job.') + parser.add_argument('-A', '--alloc-bins', dest='binpack',action='store_true', help='input file (for init phase)') # Environment variables parser.add_argument('-w','--workdir', dest='workdir', help='Working directory for pipeline') From 7963c327cb8685f9cd1a980ef40209e5ab4376c5 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Wed, 27 Mar 2024 15:53:32 +0000 Subject: [PATCH 07/20] Docstring and typehint updates, renamed allocator, added evaluate future script --- docs/source/errors.rst | 3 +- docs/source/pipeline-source.rst | 18 +++- pipeline/allocate.py | 53 +++++++++ pipeline/allocator.py | 23 ---- pipeline/evaluate.py | 9 ++ pipeline/utils.py | 183 +++++++++++++++++++++++--------- 6 files changed, 214 insertions(+), 75 deletions(-) create mode 100644 pipeline/allocate.py delete mode 100644 pipeline/allocator.py create mode 100644 pipeline/evaluate.py diff --git a/docs/source/errors.rst b/docs/source/errors.rst index 61f5c6d..1a379fe 100644 --- a/docs/source/errors.rst +++ b/docs/source/errors.rst @@ -4,4 +4,5 @@ Custom Pipeline Errors **A summary of the custom errors that are experienced through running the pipeline.** .. automodule:: pipeline.errors - :members: \ No newline at end of file + :members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/pipeline-source.rst b/docs/source/pipeline-source.rst index de577b1..9bfec8f 100644 --- a/docs/source/pipeline-source.rst +++ b/docs/source/pipeline-source.rst @@ -32,4 +32,20 @@ Validation Module ================= .. automodule:: pipeline.validate - :members: \ No newline at end of file + :members: + +========= +Utilities +========= + +.. automodule:: pipeline.utils + :members: + :show-inheritance: + +======= +Logging +======= + +.. automodule:: pipeline.logs + :members: + :show-inheritance: \ No newline at end of file diff --git a/pipeline/allocate.py b/pipeline/allocate.py new file mode 100644 index 0000000..3f72f21 --- /dev/null +++ b/pipeline/allocate.py @@ -0,0 +1,53 @@ +__author__ = "Daniel Westwood" +__contact__ = "daniel.westwood@stfc.ac.uk" +__copyright__ = "Copyright 2023 United Kingdom Research and Innovation" + +# Job Subset Allocation Script +# - Calculate Expected Utilisation for each Job (500 + Kerchunk Size*5 MB) + +# First-Fit Bin Packing Algorithm +# - Sort Utilisations from largest to smallest +# - Bin Capacity is largest size rounded up to next memory cap (1, 2, 3, 4 GB) +# - Allocate item to first bin with space remaining +# - End with N bins (subsets) - write list of project codes for each subset to a separate file in proj_code_subsets/set_N.txt +# - Run array with number of subsets already set. 
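
# --- Illustrative aside (not part of the patch) ----------------------------
# Sketch of the packing step described above, using the binpacking package
# that create_allocation() below relies on. Weights are per-project time
# estimates in seconds (made-up values here) and the bin capacity is a
# four-hour wall-clock budget.
import binpacking

time_estms = {
    'projA': 1800.0,   # e.g. 500 + (2.5 + 1.5*convert_estm)*num_files
    'projB': 7200.0,
    'projC': 900.0,
}
bins = binpacking.to_constant_volume(time_estms, 4 * 3600)
# -> a list of dicts, one per bin/subset,
#    e.g. [{'projB': 7200.0}, {'projA': 1800.0, 'projC': 900.0}]
# Each bin would then become its own proj_codes subset file, with the SLURM
# array index selecting a bin rather than a single dataset.
# ----------------------------------------------------------------------------
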
+ +# Utilisation estimate is (total_chunks * 835) + 500 (MB) +""" +for proj_code in (repeat_id set): + open detail-cfg (for this code) + calculate utilisation + add to dict [utilisation, proj_code] + keep track of max/min +get bins using binpacking (pypi) +""" +import binpacking + +from pipeline.utils import get_codes, get_proj_file, get_proj_dir +from pipeline.errors import MissingKerchunkError + +def create_allocation(args): + proj_codes = get_codes(args.groupID, args.workdir, f'proj_codes/{args.repeat_id}') + + time_estms = {} + others = [] + + for p in proj_codes: + proj_dir = get_proj_dir(p, args.workdir, args.groupID) + detail = get_proj_file(proj_dir, 'detail-cfg.json') + if not detail:# or 'skipped' in detail: + raise MissingKerchunkError(f"Detail file not found for {p} - cannot allocate all proj_codes") + if args.phase == 'compute': + # Experimental values for time estimation + if 'timings' in detail: + time_estms[p] = 500 + (2.5 + 1.5*detail['timings']['convert_estm'])*detail['num_files'] + elif 'skipped' in detail: + others.append(p) + + bins = binpacking.to_constant_volume(time_estms, 4*3600) + for b in bins: + print(b) + # Write out as allocations/{label}/bin_0.txt + # Slurm ID is now the ID of the bin + + diff --git a/pipeline/allocator.py b/pipeline/allocator.py deleted file mode 100644 index e5be62a..0000000 --- a/pipeline/allocator.py +++ /dev/null @@ -1,23 +0,0 @@ -__author__ = "Daniel Westwood" -__contact__ = "daniel.westwood@stfc.ac.uk" -__copyright__ = "Copyright 2023 United Kingdom Research and Innovation" - -# Job Subset Allocation Script -# - Calculate Expected Utilisation for each Job (500 + Kerchunk Size*5 MB) - -# First-Fit Bin Packing Algorithm -# - Sort Utilisations from largest to smallest -# - Bin Capacity is largest size rounded up to next memory cap (1, 2, 3, 4 GB) -# - Allocate item to first bin with space remaining -# - End with N bins (subsets) - write list of project codes for each subset to a separate file in proj_code_subsets/set_N.txt -# - Run array with number of subsets already set. - -# Utilisation estimate is (total_chunks * 835) + 500 (MB) -""" -for proj_code in (repeat_id set): - open detail-cfg (for this code) - calculate utilisation - add to dict [utilisation, proj_code] - keep track of max/min -get bins using binpacking (pypi) -""" \ No newline at end of file diff --git a/pipeline/evaluate.py b/pipeline/evaluate.py new file mode 100644 index 0000000..c1fc66c --- /dev/null +++ b/pipeline/evaluate.py @@ -0,0 +1,9 @@ +__author__ = "Daniel Westwood" +__contact__ = "daniel.westwood@stfc.ac.uk" +__copyright__ = "Copyright 2023 United Kingdom Research and Innovation" + +""" +Requirements for CRON + - Determine which jobs have succeeded with one phase. + - Must return a list of the jobs that need to be rerun? 
+""" \ No newline at end of file diff --git a/pipeline/utils.py b/pipeline/utils.py index 5eddde8..30baf85 100644 --- a/pipeline/utils.py +++ b/pipeline/utils.py @@ -6,11 +6,56 @@ import xarray as xr import json import fsspec +import logging from pipeline.errors import MissingVariableError, MissingKerchunkError, ChunkDataError -def open_kerchunk(kfile: str, logger, isparq=False, remote_protocol='file'): - """Open kerchunk file from JSON/parquet formats""" +class BypassSwitch: + def __init__(self, switch='DBSCMR'): + if switch.startswith('+'): + switch = 'DBSCMR' + switch[1:] + self.switch = switch + if type(switch) == str: + switch = list(switch) + + self.skip_driver = ('D' in switch) + self.skip_boxfail = ('B' in switch) + self.skip_softfail = ('S' in switch) + self.skip_data_sum = ('C' in switch) + self.skip_xkshape = ('X' in switch) + self.skip_report = ('R' in switch) + + # Removed scanfile and memory skips + + def __str__(self): + return self.switch + + def help(self): + return str(""" +Bypass switch options: \n + "F" - * Skip individual file scanning errors. + "D" - * Skip driver failures - Pipeline tries different options for NetCDF (default). + - Only need to turn this skip off if all drivers fail (KerchunkFatalDriverError). + "B" - Skip Box compute errors. + "S" - * Skip Soft fails (NaN-only boxes in validation) (default). + "C" - * Skip calculation (data sum) errors (time array typically cannot be summed) (default). + "M" - Skip memory checks (validate/compute aborts if utilisation estimate exceeds cap). +""") + +def open_kerchunk(kfile: str, logger, isparq=False, remote_protocol='file') -> xr.Dataset: + """ + Open kerchunk file from JSON/parquet formats + + :param kfile: (str) Path to a kerchunk file (or https link if using a remote file) + + :param logger: (obj) Logging object for info/debug/error messages. + + :param isparq: (bool) Switch for using Parquet or JSON Format + + :param remote_protocol: (str) 'file' for local filepaths, 'http' for remote links. + + :returns: An xarray virtual dataset constructed from the Kerchunk file + """ if isparq: logger.debug('Opening Kerchunk Parquet store') from fsspec.implementations.reference import ReferenceFileSystem @@ -47,10 +92,18 @@ def open_kerchunk(kfile: str, logger, isparq=False, remote_protocol='file'): logger.debug('Successfully opened Kerchunk with virtual xarray ds') return ds -def get_attribute(env: str, args, var: str): - """Assemble environment variable or take from passed argument. +def get_attribute(env: str, args, var: str) -> str: + """ + Assemble environment variable or take from passed argument. Find + value of variable from Environment or ParseArgs object, or reports failure. + + :param env: (str) Name of environment variable. + + :param args: (obj) Set of command line arguments supplied by argparse. - Finds value of variable from Environment or ParseArgs object, or reports failure + :param var: (str) Name of argparse parameter to check. + + :returns: Value of either environment variable or argparse value. """ try: if getattr(args, var): @@ -63,8 +116,10 @@ def get_attribute(env: str, args, var: str): print(var) raise MissingVariableError(type=var) -def format_str(string: str, length: int, concat=False): - """Simple function to format a string to a correct length""" +def format_str(string: str, length: int, concat=False) -> str: + """ + Simple function to format a string to a correct length. + """ string = str(string) if len(string) >= length and concat: string = string[:length-3] + '...' 
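
# --- Illustrative aside (not part of the patch) ----------------------------
# Usage sketch for the BypassSwitch class added to utils.py above: the
# constructor takes the -b/--bypass-errs string and exposes boolean skip flags.
switches = BypassSwitch('DBSCMR')
assert switches.skip_driver and switches.skip_report   # 'D' and 'R' present
assert not switches.skip_xkshape                       # 'X' not in the defaults

# A leading '+' keeps the default set and appends extra characters:
extended = BypassSwitch('+X')
assert extended.skip_xkshape and extended.skip_softfail
assert str(extended) == 'DBSCMRX'
# ----------------------------------------------------------------------------
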
@@ -72,41 +127,12 @@ def format_str(string: str, length: int, concat=False): while len(string) < length: string += ' ' return string[:length] + +def mem_to_val(value: str) -> float: + """ + Convert a value in Bytes to an integer number of bytes + """ -class BypassSwitch: - def __init__(self, switch='DBSCMR'): - if switch.startswith('+'): - switch = 'DBSCMR' + switch[1:] - self.switch = switch - if type(switch) == str: - switch = list(switch) - - self.skip_driver = ('D' in switch) - self.skip_boxfail = ('B' in switch) - self.skip_softfail = ('S' in switch) - self.skip_data_sum = ('C' in switch) - self.skip_xkshape = ('X' in switch) - self.skip_report = ('R' in switch) - - # Removed scanfile and memory skips - - def __str__(self): - return self.switch - - def help(self): - return str(""" -Bypass switch options: \n - "F" - * Skip individual file scanning errors. - "D" - * Skip driver failures - Pipeline tries different options for NetCDF (default). - - Only need to turn this skip off if all drivers fail (KerchunkFatalDriverError). - "B" - Skip Box compute errors. - "S" - * Skip Soft fails (NaN-only boxes in validation) (default). - "C" - * Skip calculation (data sum) errors (time array typically cannot be summed) (default). - "M" - Skip memory checks (validate/compute aborts if utilisation estimate exceeds cap). -""") - -def mem_to_val(value): - """Convert a value in Bytes to an integer number of bytes""" suffixes = { 'KB': 1000, 'MB': 1000000, @@ -116,10 +142,25 @@ def mem_to_val(value): suff = suffixes[value.split(' ')[1]] return float(value.split(' ')[0]) * suff -def get_codes(group, workdir, filename): - """Returns a list of the project codes given a filename (repeat id)""" +def get_codes(group: str, workdir: str | None, filename: str, extension='.txt') -> list: + """ + Returns a list of the project codes given a filename (repeat id) + + :param group: (str) Name of current group or path to group directory + (groupdir) in which case workdir can be left as None. + + :param workdir: (str | None) Path to working directory. If this is none, + group value will be assumed as the groupdir path. + + :param filename: (str) Name of text file to access within group (or path + within the groupdir to the text file + + :param extension: (str) For the specific case of non-text-files. + + :returns: A list of codes if the file is found, an empty list otherwise. + """ if workdir: - codefile = f'{workdir}/groups/{group}/{filename}.txt' + codefile = f'{workdir}/groups/{group}/{filename}{extension}' else: codefile = f'{group}/{filename}.txt' if os.path.isfile(codefile): @@ -128,10 +169,31 @@ def get_codes(group, workdir, filename): else: return [] -def set_codes(group, workdir, filename, contents, overwrite=0): +def set_codes(group: str, workdir: str | None, filename: str, contents, extension='.txt', overwrite=0) -> None: + """ + Returns a list of the project codes given a filename (repeat id) + + :param group: (str) Name of current group or path to group directory + (groupdir) in which case workdir can be left as None. + + :param workdir: (str | None) Path to working directory. If this is none, + group value will be assumed as the groupdir path. + + :param filename: (str) Name of text file to access within group (or path + within the groupdir to the text file + + :param contents: (str) Combined contents to write to the file. + + :param extension: (str) For the specific case of non-text-files. 
+ + :param overwrite: (str) Specifier for open() built-in python method, completely + overwrite the file contents or append to existing values. + + :returns: None + """ codefile = f'{group}/{filename}.txt' if workdir: - codefile = f'{workdir}/groups/{group}/{filename}.txt' + codefile = f'{workdir}/groups/{group}/{filename}{extension}' ow = 'w' if overwrite == 1: @@ -140,7 +202,16 @@ def set_codes(group, workdir, filename, contents, overwrite=0): with open(codefile, ow) as f: f.write(contents) -def get_proj_file(proj_dir, proj_file): +def get_proj_file(proj_dir: str, proj_file: str) -> dict | None: + """ + Returns the contents of a project file within a project code directory. + + :param proj_code: (str) The project code in string format (DOI) + + :param proj_file: (str) Name of a file to access within the project directory. + + :returns: A dictionary of the contents of a json file or None if there are problems. + """ projfile = f'{proj_dir}/{proj_file}' if os.path.isfile(projfile): try: @@ -154,7 +225,16 @@ def get_proj_file(proj_dir, proj_file): else: return None -def set_proj_file(proj_dir, proj_file, contents, logger): +def set_proj_file(proj_dir: str, proj_file: str, contents: list, logger: logging.Logger) -> None: + """ + Overwrite the contents of a project file within a project code directory. + + :param proj_code: (str) The project code in string format (DOI) + + :param proj_file: (str) Name of a file to access within the project directory. + + :returns: A dictionary of the contents of a json file or None if there are problems. + """ projfile = f'{proj_dir}/{proj_file}' if not os.path.isfile(projfile): os.system(f'touch {projfile}') @@ -165,9 +245,12 @@ def set_proj_file(proj_dir, proj_file, contents, logger): except Exception as err: logger.error(f'{proj_file} unable to update - {err}') -def get_proj_dir(proj_code, workdir, groupID): +def get_proj_dir(proj_code: str, workdir: str, groupID: str) -> str: + """ + Simple function to assemble the project directory, depends on groupID + May be redundant in the future if a 'serial' directory is added. + """ if groupID: return f'{workdir}/in_progress/{groupID}/{proj_code}' else: return f'{workdir}/in_progress/{proj_code}' - From 00a424c9c7359b36117a0c3e9edad6f08d63e704 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Wed, 27 Mar 2024 15:56:05 +0000 Subject: [PATCH 08/20] Added error/output log suppression - using manual fh switching --- extensions/templates/phase.sbatch.template | 5 ++++- group_run.py | 13 +++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/extensions/templates/phase.sbatch.template b/extensions/templates/phase.sbatch.template index 3e4a934..ba44861 100644 --- a/extensions/templates/phase.sbatch.template +++ b/extensions/templates/phase.sbatch.template @@ -5,10 +5,13 @@ #SBATCH --time={} #SBATCH --mem={} +#SBATCH -o /dev/null +#SBATCH -e /dev/null + module add jaspy source {}/bin/activate export WORKDIR={} export GROUPDIR={} -python {} {} $SLURM_ARRAY_TASK_ID -G {} -t {} -M {} \ No newline at end of file +python {} {} $SLURM_ARRAY_TASK_ID -G {} -t {} -M {} -r {} \ No newline at end of file diff --git a/group_run.py b/group_run.py index 6ab23ac..21247b2 100644 --- a/group_run.py +++ b/group_run.py @@ -111,18 +111,19 @@ def main(args) -> None: if args.memory: mem = args.memory - outdir = f'{args.workdir}/groups/args.groupID/outs/raw/%A_%a.out' - errdir = f'{args.workdir}/groups/{args.groupID}/errs/raw/%A_%a.out' + # Suppressed since now manually logging with changing filehandler. 
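
# --- Illustrative aside (not part of the patch) ----------------------------
# With sbatch stdout/stderr sent to /dev/null, all job output now has to reach
# a file through the logger instead. A minimal sketch of the kind of
# file-handler swap reset_file_handler() performs (assumption - the pipeline's
# own helper in pipeline.logs may differ in detail):
import logging

def swap_file_handler(logger: logging.Logger, verbose: int, new_file: str) -> logging.Logger:
    # Drop any existing file handlers, then attach one pointing at new_file.
    for handler in list(logger.handlers):
        if isinstance(handler, logging.FileHandler):
            logger.removeHandler(handler)
    fh = logging.FileHandler(new_file)
    fh.setLevel(logging.DEBUG if verbose else logging.INFO)
    logger.addHandler(fh)
    return logger
# ----------------------------------------------------------------------------
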
+ #outdir = f'{args.workdir}/groups/args.groupID/outs/raw/%A_%a.out' + #errdir = f'{args.workdir}/groups/{args.groupID}/errs/raw/%A_%a.out' - os.system(f'rm -rf {outdir}/*') - os.system(f'rm -rf {errdir}/*') + #os.system(f'rm -rf {outdir}/*') + #os.system(f'rm -rf {errdir}/*') sb = sbatch.format( f'{group}_{phase}_array', # Job name time, # Time mem, # Memory - outdir, - errdir, + #outdir, + #errdir, VENV, args.workdir, args.groupdir, From 53deb41bd0f27a4970f15f600f969a033ff5f969 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Wed, 27 Mar 2024 16:27:25 +0000 Subject: [PATCH 09/20] Updated docstrings/typehints --- pipeline/compute/serial_process.py | 366 +++++++++++++++++++---------- 1 file changed, 241 insertions(+), 125 deletions(-) diff --git a/pipeline/compute/serial_process.py b/pipeline/compute/serial_process.py index 7f69384..ff7cbbc 100644 --- a/pipeline/compute/serial_process.py +++ b/pipeline/compute/serial_process.py @@ -1,4 +1,7 @@ -# Borrows from kerchunk tools but with more automation +__author__ = "Daniel Westwood" +__contact__ = "daniel.westwood@stfc.ac.uk" +__copyright__ = "Copyright 2023 United Kingdom Research and Innovation" + import os import json import sys @@ -10,15 +13,9 @@ import base64 from pipeline.logs import init_logger, FalseLogger -from pipeline.utils import BypassSwitch +from pipeline.utils import BypassSwitch, open_kerchunk, get_proj_file from pipeline.errors import * -from pipeline.validate import validate_data, open_kerchunk, validate_selection - -class KerchunkDriverFatalError(Exception): - - def __init__(self, message="All drivers failed when performing conversion"): - self.message = message - super().__init__(self.message) +from pipeline.validate import validate_selection WORKDIR = None CONCAT_MSG = 'See individual files for more details' @@ -26,58 +23,131 @@ def __init__(self, message="All drivers failed when performing conversion"): class Converter: """Class for converting a single file to a Kerchunk reference object""" - def __init__(self, clogger, bypass_driver=False, ctype=None): - self.logger = clogger - self.ctype = ctype - self.success = True + def __init__(self, clogger, bypass_driver=False, ctype=None) -> None: + self.logger = clogger + self.ctype = ctype + self.success = True self.bypass_driver = bypass_driver - def convert_to_zarr(self, nfile, extension=False, **kwargs): - """Perform conversion to zarr with exceptions for bypassing driver errors.""" + def convert_to_zarr(self, nfile: str, extension=False, **kwargs) -> None: + """ + Perform conversion to zarr with exceptions for bypassing driver errors. + + :param nfile: (str) Path to a local native file of an appropriate + type to be converted. + + :param extension: (str) File extension relating to file type if known. + All extensions/drivers will be tried first, subsequent + files in the same dataset will use whatever extension + worked for the first file as a starting point. + + :returns: The output of performing a driver if successful, None + if the driver is unsuccessful. Errors will be bypassed + if the bypass_driver option is selected for this class. 
+ """ drivers = { 'ncf3': self.ncf3_to_zarr, 'hdf5': self.hdf5_to_zarr, - 'tif' : self.tiff_to_zarr + 'tif' : self.tiff_to_zarr, + 'grib': self.grib_to_zarr, } if extension: self.ctype=extension try: if self.ctype in drivers: - return drivers[self.ctype](nfile, **kwargs) + ref = drivers[self.ctype](nfile, **kwargs) + return ref else: self.logger.debug(f'Extension {self.ctype} not valid') return None except Exception as err: if self.bypass_driver: - pass + return None else: raise err + + def save_individual_ref(self, ref: dict, cache_ref: str, forceful=False) -> None: + """ + Save each individual set of refs created for each file immediately to reduce + loss of progress in the event of a failure somewhere in processing. + """ + if ref and (not os.path.isfile(cache_ref) or forceful): + with open(cache_ref,'w') as f: + f.write(json.dumps(ref)) - def hdf5_to_zarr(self, nfile, **kwargs): - """Converter for HDF5 type files""" + def hdf5_to_zarr(self, nfile: str, **kwargs) -> None: + """Wrapper for converting NetCDF4/HDF5 type files to Kerchunk""" from kerchunk.hdf import SingleHdf5ToZarr return SingleHdf5ToZarr(nfile, **kwargs).translate() - def ncf3_to_zarr(self, nfile, **kwargs): - """Converter for NetCDF3 type files""" + def ncf3_to_zarr(self, nfile: str, **kwargs) -> None: + """Wrapper for converting NetCDF3 type files to Kerchunk""" from kerchunk.netCDF3 import NetCDF3ToZarr return NetCDF3ToZarr(nfile, **kwargs).translate() - def tiff_to_zarr(self, tfile, **kwargs): - """Converter for Tiff type files""" - self.logger.error('Tiff conversion not yet implemented - aborting') - self.success = False - return None + def tiff_to_zarr(self, tfile: str, **kwargs) -> None: + """Wrapper for converting GeoTiff type files to Kerchunk""" + from kerchunk.tiff import TiffToZarr + return TiffToZarr(tfile, **kwargs).translate() + + def grib_to_zarr(self, gfile: str, **kwargs) -> None: + """Wrapper for converting GRIB type files to Kerchunk""" + from kerchunk.grib2 import GribToZarr + return GribToZarr(gfile, **kwargs).translate() class Indexer(Converter): def __init__(self, proj_code, - cfg_file=None, detail_file=None, workdir=WORKDIR, - issave_meta=False, thorough=False, forceful=False, - verb=0, mode=None, version_no='trial-', - concat_msg=CONCAT_MSG, bypass=BypassSwitch(), - groupID=None, limiter=None, dryrun=True, ctype=None, fh=None, logid=None, **kwargs): - """Initialise indexer for this dataset, set all variables and prepare for computation""" + workdir=WORKDIR, thorough=False, forceful=False, + verb=0, mode=None, version_no='trial-', concat_msg=CONCAT_MSG, bypass=BypassSwitch(), + groupID=None, limiter=None, dryrun=True, ctype=None, fh=None, logid=None, **kwargs) -> None: + """ + Initialise indexer for this dataset, set all variables and prepare for computation. + + :param proj_code: (str) The project code in string format (DOI) + + :param workdir: (str) Path to the current working directory. + + :param thorough: (bool) From args.quality - if True will create all files from scratch, + otherwise saved refs from previous runs will be loaded. + + :param forceful: (bool) Continue with processing even if final output file already exists. + + :param verb: (int) From args.verbose - Level of verboseness (see logs.init_logger). + + :param mode: (str) Unused parameter for different logging output mechanisms. + + :param version_no: (str) Kerchunk revision number/identifier. Default is trial - used for + 'scan' phase, will be overridden with specific revision in 'compute' + actual phase. 
+ + :param concat_msg: (str) Value displayed as global attribute for any attributes that + differ across the set of files, instead of a list of the differences, + this message will be used, default can be found above. + + :param bypass: (BypassSwitch) instance of BypassSwitch class containing multiple + bypass/skip options for specific events. See utils.BypassSwitch. + + :param groupID: (str) Name of current dataset group. + + :param limiter: (int) Number of files to process from the whole set of files. Default + value of None will mean all files are processed. Any non-None value + will limit the number of files for processing - utilised in 'scan' phase. + + :param dryrun: (bool) From args.dryrun - if True will prevent output files being generated + or updated and instead will demonstrate commands that would otherwise happen. + + :param ctype: (str) Extension/filetype of the set of files to be processed if already known. + + :param fh: (str) Path to logfile for logger object generated in this specific process. + + :param logid: (str) ID of the process within a subset, which is then added to the name + of the logger - prevents multiple processes with different logfiles getting + loggers confused. + + :returns: None + + """ super().__init__(init_logger(verb, mode, 'compute-serial', fh=fh, logid=logid), bypass_driver=bypass.skip_driver, ctype=ctype) self.logger.debug('Starting variable definitions') @@ -91,17 +161,29 @@ def __init__(self, self.thorough = thorough self.forceful = forceful + self.validate_time = None + self.concat_time = None + self.convert_time = None + self.dryrun = dryrun - self.issave_meta = issave_meta self.updates, self.removals, self.load_refs = False, False, False + if groupID: + self.proj_dir = f'{self.workdir}/in_progress/{groupID}/{self.proj_code}' + else: + self.proj_dir = f'{self.workdir}/in_progress/{self.proj_code}' + self.logger.debug('Loading config information') - with open(cfg_file) as f: - cfg = json.load(f) + self.cfg = get_proj_file(self.proj_dir, 'base-cfg.json') + + self.detail = get_proj_file(self.proj_dir, 'detail-cfg.json') + if not self.detail: + self.detail={} + + with open(f'{self.proj_dir}/allfiles.txt')as f: + self.num_files = len(list(f.readlines())) - self.detailfile = detail_file - with open(detail_file) as f: - self.detail = json.load(f) + self.partial = (self.limiter and self.num_files != self.limiter) if 'virtual_concat' not in self.detail: self.detail['virtual_concat'] = False @@ -110,20 +192,15 @@ def __init__(self, if 'version_no' in self.detail: self.version_no = self.detail['version_no'] - if groupID: - self.proj_dir = f'{self.workdir}/in_progress/{groupID}/{self.proj_code}' - else: - self.proj_dir = f'{self.workdir}/in_progress/{self.proj_code}' - - if 'update' in cfg: + if 'update' in self.cfg: try: - self.updates = dict(cfg['update']) + self.updates = dict(self.cfg['update']) except ValueError: self.logger.warning('Updates attribute not read') self.updates = {} - if 'remove' in cfg: + if 'remove' in self.cfg: try: - self.removals = dict(cfg['remove']) + self.removals = dict(self.cfg['remove']) except ValueError: self.logger.warning('Removal attribute not read') self.removals = {} @@ -133,8 +210,6 @@ def __init__(self, else: self.use_json = True - self.use_json = True - self.outfile = f'{self.proj_dir}/kerchunk-{version_no}a.json' self.outstore = f'{self.proj_dir}/kerchunk-{version_no}a.parq' self.record_size = 167 # Default @@ -162,30 +237,40 @@ def __init__(self, self.set_filelist() self.logger.debug('Finished all setup steps') - 
def collect_details(self): - """Collect kwargs for combining and any special attributes - save to detail file.""" + def collect_details(self) -> dict: + """ + Collect kwargs for combining and any special attributes - save to detail file. + """ self.detail['combine_kwargs'] = self.combine_kwargs if self.special_attrs: self.detail['special_attrs'] = list(self.special_attrs.keys()) return self.detail - def set_filelist(self): - """Get the list of files from the filelist for this dataset""" + def set_filelist(self) -> None: + """ + Get the list of files from the filelist for this dataset and set + to 'filelist' list. + """ with open(self.filelist) as f: self.listfiles = [r.strip() for r in f.readlines()] if not self.limiter: self.limiter = len(self.listfiles) - def add_download_link(self, refs): - """Add the download link to the Kerchunk references""" + def add_download_link(self, refs: dict) -> dict: + """ + Add the download link to each of the Kerchunk references + """ for key in refs.keys(): if len(refs[key]) == 3: if refs[key][0][0] == '/': refs[key][0] = 'https://dap.ceda.ac.uk' + refs[key][0] return refs - def add_kerchunk_history(self, attrs): - """Add kerchunk variables to the metadata for this dataset""" + def add_kerchunk_history(self, attrs: dict) -> dict: + """ + Add kerchunk variables to the metadata for this dataset, including + creation/update date and version/revision number. + """ from datetime import datetime @@ -210,10 +295,11 @@ def add_kerchunk_history(self, attrs): attrs['kerchunk_creation_date'] = now.strftime("%d%m%yT%H%M%S") return attrs - def find_concat_dims(self, ds_examples): + def find_concat_dims(self, ds_examples: list) -> None: """Find dimensions to use when combining for concatenation - Dimensions which change over the set of files must be concatenated together - - Dimensions which do not change (typically lat/lon) are instead identified as identical_dims""" + - Dimensions which do not change (typically lat/lon) are instead identified as identical_dims + """ concat_dims = [] for dim in ds_examples[0].dims: try: @@ -231,8 +317,9 @@ def find_concat_dims(self, ds_examples): self.detail['virtual_concat'] = True self.combine_kwargs['concat_dims'] = concat_dims - def find_identical_dims(self, ds_examples): - """Find dimensions and variables that are identical across the set of files. + def find_identical_dims(self, ds_examples: list) -> None: + """ + Find dimensions and variables that are identical across the set of files. - Variables which do not change (typically lat/lon) are identified as identical_dims and not concatenated over the set of files. - Variables which do change are concatenated as usual. """ @@ -266,46 +353,52 @@ def find_identical_dims(self, ds_examples): else: self.logger.debug(f'Found {normal_dims} that vary over concatenation_dimensions: {self.combine_kwargs["concat_dims"]}') - def combine_and_save(self, refs, zattrs): - """Concatenation of ref data for different kerchunk schemes""" + def combine_and_save(self, refs: dict, zattrs: dict) -> None: + """ + Concatenation of ref data for different kerchunk schemes + """ self.logger.info('Starting concatenation of refs') - if len(refs) > 1: + if not (len(refs) == 1 or type(refs) == dict): # Pick 2 refs to use when determining dimension info. 
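
# --- Illustrative aside (not part of the patch) ----------------------------
# Sketch of the check find_concat_dims()/find_identical_dims() above perform on
# the first and last reference sets: coordinates that differ between the two
# become concat_dims, while values that match everywhere are identical_dims.
def split_dims(ds_first, ds_last):
    concat_dims, identical_dims = [], []
    for dim in ds_first.dims:
        try:
            same = bool(ds_first[dim].equals(ds_last[dim]))
        except KeyError:
            # Dimension has no coordinate variable - nothing to concatenate over.
            same = True
        (identical_dims if same else concat_dims).append(dim)
    return concat_dims, identical_dims
# ----------------------------------------------------------------------------
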
- if len(refs) == 1 or type(refs) == dict: - pass + # Concatenation Dimensions + if 'combine_kwargs' in self.detail: + self.combine_kwargs = self.detail['combine_kwargs'] else: - # Concatenation Dimensions - if 'combine_kwargs' in self.detail: - self.combine_kwargs = self.detail['combine_kwargs'] + # Calculate Partial Validation Estimate here + t1 = datetime.now() + self.logger.info("Determining concatenation dimensions") + print() + self.find_concat_dims([ + open_kerchunk(refs[0], FalseLogger(), remote_protocol=None), + open_kerchunk(refs[-1], FalseLogger(), remote_protocol=None) + ]) + if self.combine_kwargs['concat_dims'] == []: + self.logger.info(f"No concatenation dimensions available - virtual dimension will be constructed.") else: - self.logger.info("Determining concatenation dimensions") - print() - self.find_concat_dims([ - open_kerchunk(refs[0], FalseLogger()), - open_kerchunk(refs[-1], FalseLogger()) - ]) - if self.combine_kwargs['concat_dims'] == []: - self.logger.info(f"No concatenation dimensions available - virtual dimension will be constructed.") - else: - self.logger.info(f"Found {self.combine_kwargs['concat_dims']} concatenation dimensions.") - print() - - # Identical (Variables) Dimensions - self.logger.info("Determining identical variables") - print() - self.find_identical_dims([ - open_kerchunk(refs[0], FalseLogger()), - open_kerchunk(refs[-1], FalseLogger()) - ]) - self.logger.info(f"Found {self.combine_kwargs['identical_dims']} identical variables.") - print() - + self.logger.info(f"Found {self.combine_kwargs['concat_dims']} concatenation dimensions.") + print() + + # Identical (Variables) Dimensions + self.logger.info("Determining identical variables") + print() + self.find_identical_dims([ + open_kerchunk(refs[0], FalseLogger(), remote_protocol=None), + open_kerchunk(refs[-1], FalseLogger(), remote_protocol=None) + ]) + self.logger.info(f"Found {self.combine_kwargs['identical_dims']} identical variables.") + print() + + # This one only happens for two files so don't need to take a mean + self.validate_time = (datetime.now()-t1).total_seconds() + + t1 = datetime.now() if self.use_json: self.logger.info('Concatenating to JSON format Kerchunk file') self.data_to_json(refs, zattrs) else: self.logger.info('Concatenating to Parquet format Kerchunk store') self.data_to_parq(refs) + self.concat_time = (datetime.now()-t1).total_seconds()/self.limiter if not self.dryrun: self.collect_details() @@ -313,8 +406,11 @@ def combine_and_save(self, refs, zattrs): f.write(json.dumps(self.detail)) self.logger.info("Details updated in detail-cfg.json") - def construct_virtual_dim(self, refs): - """Construct a Virtual dimension for stacking multiple files where no suitable concatenation dimension is present.""" + def construct_virtual_dim(self, refs: dict) -> None: + """ + Construct a Virtual dimension for stacking multiple files + where no suitable concatenation dimension is present. 
+ """ # For now this just means creating a list of numbers 0 to N files vdim = 'file_number' @@ -348,8 +444,11 @@ def construct_virtual_dim(self, refs): ref[f'{vdim}/0'] = values return refs, vdim - def data_to_parq(self, refs): - """Concatenating to Parquet format Kerchunk store""" + def data_to_parq(self, refs: dict) -> None: + """ + Concatenating to Parquet-format Kerchunk store + """ + from kerchunk.combine import MultiZarrToZarr from fsspec import filesystem from fsspec.implementations.reference import LazyReferenceMapper @@ -364,15 +463,19 @@ def data_to_parq(self, refs): refs, out=out, remote_protocol='file', - concat_dims=['time'], **self.combine_kwargs ).translate() - out.flush() - self.logger.info(f'Written to parquet store - {self.proj_code}/kerchunk-1a.parq') + if self.partial: + self.logger.info(f'Skipped writing to parquet store - {self.outstore}') + else: + out.flush() + self.logger.info(f'Written to parquet store - {self.outstore}') - def data_to_json(self, refs, zattrs): - """Concatenating to JSON format Kerchunk file""" + def data_to_json(self, refs: dict, zattrs: dict) -> None: + """ + Concatenating to JSON-format Kerchunk file + """ from kerchunk.combine import MultiZarrToZarr self.logger.debug('Starting JSON-write process') @@ -383,7 +486,6 @@ def data_to_json(self, refs, zattrs): if self.detail['virtual_concat']: refs, vdim = self.construct_virtual_dim(refs) self.combine_kwargs['concat_dims'] = [vdim] - print(self.combine_kwargs) mzz = MultiZarrToZarr(list(refs), **self.combine_kwargs).translate() if zattrs: zattrs = self.add_kerchunk_history(zattrs) @@ -397,15 +499,16 @@ def data_to_json(self, refs, zattrs): # Override global attributes mzz['refs'] = self.add_download_link(mzz['refs']) - if not self.dryrun: + if not self.dryrun and not self.partial: with open(self.outfile,'w') as f: f.write(json.dumps(mzz)) self.logger.info(f'Written to JSON file - {self.outfile}') else: self.logger.info(f'Skipped writing to JSON file - {self.outfile}') - def correct_metadata(self, allzattrs): - """General function for correcting metadata + def correct_metadata(self, allzattrs: dict) -> dict: + """ + General function for correcting metadata - Combine all existing metadata in standard way (cleaning arrays) - Add updates and remove removals specified by configuration """ @@ -434,10 +537,12 @@ def correct_metadata(self, allzattrs): raise ValueError return new_zattrs - def clean_attr_array(self, allzattrs): - """Collect global attributes from all refs: + def clean_attr_array(self, allzattrs: dict) -> dict: + """ + Collect global attributes from all refs: - Determine which differ between refs and apply changes """ + base = json.loads(allzattrs[0]) self.logger.debug('Correcting time attributes') @@ -483,15 +588,17 @@ def clean_attr_array(self, allzattrs): self.logger.debug('Finished checking similar keys') return base - def clean_attrs(self, zattrs): - """Ammend any saved attributes post-combining + def clean_attrs(self, zattrs: dict) -> dict: + """ + Ammend any saved attributes post-combining - Not currently implemented, may be unnecessary """ self.logger.warning('Attribute cleaning post-loading from temp is not implemented') return zattrs - def check_time_attributes(self, times): - """Takes dict of time attributes with lists of values + def check_time_attributes(self, times: dict) -> dict: + """ + Takes dict of time attributes with lists of values - Sort time arrays - Assume time_coverage_start, time_coverage_end, duration (2 or 3 variables) """ @@ -513,12 +620,14 @@ def 
check_time_attributes(self, times): else: combined[k] = list(set(times[k])) - duration = '' # Need to compare start/end self.logger.debug('Finished time corrections') return combined - def save_metadata(self,zattrs): - """Cache metadata global attributes in a temporary file""" + def save_metadata(self,zattrs: dict) -> dict: + """ + Cache metadata global attributes in a temporary file. + """ + if not self.dryrun: with open(f'{self.cache}/temp_zattrs.json','w') as f: f.write(json.dumps(zattrs)) @@ -526,13 +635,13 @@ def save_metadata(self,zattrs): else: self.logger.debug('Skipped saving global attribute cache') - def save_individual_ref(self, ref, cache_ref): - if not os.path.isfile(cache_ref) or self.forceful: - with open(cache_ref,'w') as f: - f.write(json.dumps(ref)) + def try_all_drivers(self, nfile: str, **kwargs) -> dict | None: + """ + Safe creation allows for known issues and tries multiple drivers - def try_all_drivers(self, nfile, **kwargs): - """Safe creation allows for known issues and tries multiple drivers""" + :returns: dictionary of Kerchunk references if successful, raises error + otherwise if unsuccessful. + """ extension = False supported_extensions = ['ncf3','hdf5','tif'] @@ -561,7 +670,11 @@ def try_all_drivers(self, nfile, **kwargs): self.logger.debug(f'Scan successful with {self.ctype} driver') return tdict - def load_temp_zattrs(self): + def load_temp_zattrs(self) -> dict: + """ + Load global attributes from a 'temporary' cache file. + """ + self.logger.debug(f'Loading attributes') try: with open(f'{self.cache}/temp_zattrs.json') as f: @@ -573,7 +686,7 @@ def load_temp_zattrs(self): return None return zattrs - def create_refs(self): + def create_refs(self) -> None: """Organise creation and loading of refs - Load existing cached refs - Create new refs @@ -586,16 +699,17 @@ def create_refs(self): use_temp_zattrs = True # Attempt to load existing file - create if not exists already + t1 = datetime.now() for x, nfile in enumerate(self.listfiles[:self.limiter]): cache_ref = f'{self.cache}/{x}.json' ref = None if os.path.isfile(cache_ref) and not self.thorough: - self.logger.info(f'Loading refs: {x+1}/{len(self.listfiles)}') + self.logger.info(f'Loading refs: {x+1}/{self.limiter}') if os.path.isfile(cache_ref): with open(cache_ref) as f: ref = json.load(f) if not ref: - self.logger.info(f'Creating refs: {x+1}/{len(self.listfiles)}') + self.logger.info(f'Creating refs: {x+1}/{self.limiter}') try: ref = self.try_all_drivers(nfile, **self.create_kwargs) except KerchunkDriverFatalError as err: @@ -608,7 +722,9 @@ def create_refs(self): allzattrs.append(ref['refs']['.zattrs']) refs.append(ref) cache_ref = f'{self.cache}/{x}.json' - self.save_individual_ref(ref, cache_ref) + self.save_individual_ref(ref, cache_ref, forceful=self.forceful) + # Compute mean conversion time for this set. 
+ self.convert_time = (datetime.now()-t1).total_seconds()/self.limiter if len(partials) > 0: raise PartialDriverError(filenums=partials) From 6dca49db6a494a2fb3035ddcb0654f794652512c Mon Sep 17 00:00:00 2001 From: dwest77 Date: Thu, 28 Mar 2024 10:29:05 +0000 Subject: [PATCH 10/20] Removed old scripts, renamed compute script --- .../{compute/serial_process.py => compute.py} | 145 ++++++++++++++++-- pipeline/compute/parallel/batch_process.py | 53 ------- pipeline/compute/parallel/combine_refs.py | 35 ----- pipeline/compute/parallel/correct_meta.py | 21 --- pipeline/compute/parallel/correct_time.py | 14 -- pipeline/compute/parallel/process_wrapper.py | 80 ---------- 6 files changed, 133 insertions(+), 215 deletions(-) rename pipeline/{compute/serial_process.py => compute.py} (85%) delete mode 100644 pipeline/compute/parallel/batch_process.py delete mode 100644 pipeline/compute/parallel/combine_refs.py delete mode 100644 pipeline/compute/parallel/correct_meta.py delete mode 100644 pipeline/compute/parallel/correct_time.py delete mode 100644 pipeline/compute/parallel/process_wrapper.py diff --git a/pipeline/compute/serial_process.py b/pipeline/compute.py similarity index 85% rename from pipeline/compute/serial_process.py rename to pipeline/compute.py index ff7cbbc..7c5d0cd 100644 --- a/pipeline/compute/serial_process.py +++ b/pipeline/compute.py @@ -13,14 +13,14 @@ import base64 from pipeline.logs import init_logger, FalseLogger -from pipeline.utils import BypassSwitch, open_kerchunk, get_proj_file +from pipeline.utils import BypassSwitch, open_kerchunk, get_proj_file, set_proj_file from pipeline.errors import * from pipeline.validate import validate_selection WORKDIR = None CONCAT_MSG = 'See individual files for more details' -class Converter: +class KerchunkConverter: """Class for converting a single file to a Kerchunk reference object""" def __init__(self, clogger, bypass_driver=False, ctype=None) -> None: @@ -28,6 +28,7 @@ def __init__(self, clogger, bypass_driver=False, ctype=None) -> None: self.ctype = ctype self.success = True self.bypass_driver = bypass_driver + self.loaded_refs = False def convert_to_zarr(self, nfile: str, extension=False, **kwargs) -> None: """ @@ -75,6 +76,13 @@ def save_individual_ref(self, ref: dict, cache_ref: str, forceful=False) -> None with open(cache_ref,'w') as f: f.write(json.dumps(ref)) + def load_individual_ref(self, cache_ref: str) -> dict | None: + """Wrapper for getting proj_file cache_ref.""" + ref = get_proj_file(cache_ref, None) + if ref: + self.loaded_refs = True + return ref + def hdf5_to_zarr(self, nfile: str, **kwargs) -> None: """Wrapper for converting NetCDF4/HDF5 type files to Kerchunk""" from kerchunk.hdf import SingleHdf5ToZarr @@ -95,14 +103,15 @@ def grib_to_zarr(self, gfile: str, **kwargs) -> None: from kerchunk.grib2 import GribToZarr return GribToZarr(gfile, **kwargs).translate() -class Indexer(Converter): +class KerchunkDSProcessor(KerchunkConverter): def __init__(self, proj_code, workdir=WORKDIR, thorough=False, forceful=False, verb=0, mode=None, version_no='trial-', concat_msg=CONCAT_MSG, bypass=BypassSwitch(), - groupID=None, limiter=None, dryrun=True, ctype=None, fh=None, logid=None, **kwargs) -> None: + groupID=None, limiter=None, dryrun=True, ctype=None, fh=None, logid=None, + skip_concat=False, logger=None, **kwargs) -> None: """ - Initialise indexer for this dataset, set all variables and prepare for computation. + Initialise KerchunkDSProcessor for this dataset, set all variables and prepare for computation. 
:param proj_code: (str) The project code in string format (DOI) @@ -145,10 +154,15 @@ def __init__(self, of the logger - prevents multiple processes with different logfiles getting loggers confused. + :param skip_concat: (bool) Internal parameter for skipping concat - used for parallel construction + which requires a more complex job allocation. + :returns: None """ - super().__init__(init_logger(verb, mode, 'compute-serial', fh=fh, logid=logid), bypass_driver=bypass.skip_driver, ctype=ctype) + if not logger: + logger = init_logger(verb, mode, 'compute-serial', fh=fh, logid=logid) + super().__init__(logger, bypass_driver=bypass.skip_driver, ctype=ctype) self.logger.debug('Starting variable definitions') @@ -160,6 +174,7 @@ def __init__(self, self.concat_msg = concat_msg self.thorough = thorough self.forceful = forceful + self.skip_concat= skip_concat self.validate_time = None self.concat_time = None @@ -245,6 +260,20 @@ def collect_details(self) -> dict: if self.special_attrs: self.detail['special_attrs'] = list(self.special_attrs.keys()) return self.detail + + def get_timings(self) -> dict | None: + """ + Export timed values if refs were all created from scratch. + Ref loading invalidates timings so returns None if any refs were loaded + not created. + """ + timings = None + if not self.loaded_refs: + timings = { + 'convert_actual': self.convert_time, + 'concat_actual' : self.concat_time + } + return timings def set_filelist(self) -> None: """ @@ -703,11 +732,10 @@ def create_refs(self) -> None: for x, nfile in enumerate(self.listfiles[:self.limiter]): cache_ref = f'{self.cache}/{x}.json' ref = None - if os.path.isfile(cache_ref) and not self.thorough: - self.logger.info(f'Loading refs: {x+1}/{self.limiter}') - if os.path.isfile(cache_ref): - with open(cache_ref) as f: - ref = json.load(f) + if not self.thorough: + ref = self.load_individual_ref(cache_ref) + if ref: + self.logger.info(f'Loaded refs: {x+1}/{self.limiter}') if not ref: self.logger.info(f'Creating refs: {x+1}/{self.limiter}') try: @@ -735,12 +763,105 @@ def create_refs(self) -> None: zattrs = self.correct_metadata(allzattrs) try: - if self.success: + if self.success and not self.skip_concat: self.combine_and_save(refs, zattrs) except Exception as err: # Any additional parts here. raise err +class ZarrRechunker(KerchunkConverter): + """ + Rechunk input data types directly into zarr using Pangeo Rechunker. + - If refs already exist from previous Kerchunk runs, can use these to inform rechunker. + - Otherwise will have to start from scratch. + """ + def __init__(self): + raise NotImplementedError + +def configure_kerchunk(args, logger, fh=None, logid=None): + """ + Configure all required steps for Kerchunk processing. + - Check if output files already exist. + - Configure timings post-run. + """ + version_no = 1 + complete, escape = False, False + while not (complete or escape): + out_json = f'{args.proj_dir}/kerchunk-{version_no}a.json' + out_parq = f'{args.proj_dir}/kerchunk-{version_no}a.parq' + + if os.path.isfile(out_json) or os.path.isfile(out_parq): + if args.forceful: + complete = True + elif args.new_version: + version_no += 1 + else: + escape = True + else: + complete = True + + concat_msg = '' # CMIP and CCI may be different? 
+ + if complete and not escape: + + t1 = datetime.now() + ds = KerchunkDSProcessor(args.proj_code, + workdir=args.workdir,thorough=args.quality, forceful=args.forceful, + verb=args.verbose, mode=args.mode, + version_no=version_no, concat_msg=concat_msg, bypass=args.bypass, groupID=args.groupID, + dryrun=args.dryrun, fh=fh, logid=logid) + ds.create_refs() + + compute_time = (datetime.now()-t1).total_seconds() + + detail = get_proj_file(args.proj_dir, 'detail-cfg.json') + if 'timings' not in detail: + detail['timings'] = {} + + timings = ds.get_timings() + if timings: + logger.info('Export timings for this process - all refs created from scratch.') + detail['timings']['convert_actual'] = timings['convert_actual'] + detail['timings']['concat_actual'] = timings['concat_actual'] + detail['timings']['compute_actual'] = compute_time + set_proj_file(args.proj_dir, 'detail-cfg.json', detail, logger) + + else: + logger.error('Output file already exists and there is no plan to overwrite') + return None + +def configure_zarr(args, logger): + raise NotImplementedError + +def compute_config(args, fh=None, logid=None, **kwargs) -> None: + """ + Will serve as main point of configuration for processing runs. + Must be able to assess between using Zarr/Kerchunk. + """ + + logger = init_logger(args.verbose, args.mode, 'compute-serial', fh=fh, logid=logid) + + logger.info(f'Starting computation step for {args.proj_code}') + + cfg_file = f'{args.proj_dir}/base-cfg.json' + detail_file = f'{args.proj_dir}/detail-cfg.json' + + # Preliminary checks + if not os.path.isfile(cfg_file): + logger.error(f'cfg file missing or not provided - {cfg_file}') + raise FileNotFoundError(cfg_file) + + if not os.path.isfile(detail_file): + logger.error(f'cfg file missing or not provided - {detail_file}') + raise FileNotFoundError(detail_file) + + # Open the detailfile to check type. 
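
# --- Illustrative aside (not part of the patch) ----------------------------
# The branch below keys on the 'type' field that earlier phases write into
# detail-cfg.json. An entirely made-up example of such a file:
example_detail = {
    "num_files": 732,                    # used by the allocator's time estimate
    "timings": {"convert_estm": 1.2},    # per-file estimate from the scan phase
    "type": "Zarr",                      # anything else falls through to Kerchunk
}
# Suggestion rather than patch content: detail.get('type') would avoid a
# KeyError on older detail files that predate the 'type' field. The real
# lookup and dispatch follow:
# ----------------------------------------------------------------------------
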
+ detail = get_proj_file(args.proj_dir, 'detail-cfg.json') + if detail['type'] == 'Zarr': + configure_zarr(args, logger) + else: + configure_kerchunk(args, logger) + if __name__ == '__main__': print('Serial Processor for Kerchunk Pipeline - run with single_run.py') \ No newline at end of file diff --git a/pipeline/compute/parallel/batch_process.py b/pipeline/compute/parallel/batch_process.py deleted file mode 100644 index bff97ef..0000000 --- a/pipeline/compute/parallel/batch_process.py +++ /dev/null @@ -1,53 +0,0 @@ -from kerchunk import hdf, combine, df -import fsspec.implementations.reference -from fsspec.implementations.reference import LazyReferenceMapper -from tempfile import TemporaryDirectory - -import matplotlib.pyplot as plt - -import json - -import xarray as xr -import os, sys - -VERBOSE = True - -def vprint(msg): - if VERBOSE: - print('[INFO]', msg) - -tasks = sys.argv[-1] -id = sys.argv[-2] -DEV = '/home/users/dwest77/Documents/kerchunk_dev/kerchunk-builder/' -PATH = '/gws/nopw/j04/esacci_portal/kerchunk/parq/ocean_daily_all_parts' -pq = f'{PATH}/batch{id}' -with open(f'{DEV}/test_parqs/filelists/gargant.txt') as f: - files = [r.split('\n')[0] for r in f.readlines()] - -fcount = len(files) -files_per_task = int(fcount / int(tasks)) - -subset = files[int(files_per_task*int(id)):int(files_per_task*(int(id)+1))] - -try: - os.makedirs(pq) -except: - pass - -single_ref_sets = [] -for url in subset: - vprint(url) - single_ref_sets.append(hdf.SingleHdf5ToZarr(url, inline_threshold=-1).translate()) -vprint('Kerchunked all files') -out = LazyReferenceMapper.create(100, pq, fs = fsspec.filesystem("file")) -vprint('Created Lazy Reference Mapper') -out_dict = combine.MultiZarrToZarr( - single_ref_sets, - remote_protocol="file", - concat_dims=["time"], - out=out).translate() -vprint('Written to Parquet Store') - -out.flush() -vprint('Completed Flush') - diff --git a/pipeline/compute/parallel/combine_refs.py b/pipeline/compute/parallel/combine_refs.py deleted file mode 100644 index b26eba8..0000000 --- a/pipeline/compute/parallel/combine_refs.py +++ /dev/null @@ -1,35 +0,0 @@ -import os - - -PARTS = 'esacci7_parts' -FULL = 'esacci7_full' - - -# Combine metadatae into a single zmeta directory -if not os.path.isdir(f'batch/{FULL}'): - os.makedirs(f'batch/{FULL}') -varnames = [] -for dirname in os.listdir(f'batch/{PARTS}/batch0'): - if dirname != '.zmetadata': - try: - os.makedirs(f'batch/{FULL}/{dirname}') - except: - pass - varnames.append(dirname) - -specials = {'lat':1, 'lon':1} -repeat = 76 -#for varname in varnames: -if True: - varname = 'time' - print(varname) - refid = 0 - if varname in specials: - repeat = specials[varname] - - for index in range(repeat): - directory = f'batch/{PARTS}/batch{index}/{varname}' - for ref in os.listdir(directory): - #if not os.path.isfile(f'batch/{FULL}/{varname}/refs.{refid}.parq'): - os.system(f'cp {directory}/{ref} batch/{FULL}/{varname}/refs.{refid}.parq') - refid += 1 diff --git a/pipeline/compute/parallel/correct_meta.py b/pipeline/compute/parallel/correct_meta.py deleted file mode 100644 index 181800f..0000000 --- a/pipeline/compute/parallel/correct_meta.py +++ /dev/null @@ -1,21 +0,0 @@ -# Correct shapes and chunks -import json - - -old = 4 -new = 304 -PATH = '/home/users/dwest77/Documents/kerchunk_dev/parquet/dev/batch/esacci9_full' -with open(f'{PATH}/.zmetadata') as f: - refs = json.load(f) - -meta = refs['metadata'] - -for key in meta.keys(): - if '.zarray' in key: - # Correct chunks - if meta[key]['shape'][0] == old: - meta[key]['shape'][0] = 
new - -refs['metadata'] = meta -with open(f'{PATH}/.zmetadata','w') as f: - f.write(json.dumps(refs)) \ No newline at end of file diff --git a/pipeline/compute/parallel/correct_time.py b/pipeline/compute/parallel/correct_time.py deleted file mode 100644 index a71e6ad..0000000 --- a/pipeline/compute/parallel/correct_time.py +++ /dev/null @@ -1,14 +0,0 @@ -import pandas as pd - -PATH = '/home/users/dwest77/Documents/kerchunk_dev/parquet/dev' -raw = None -for x in range(0,76): - df = pd.read_parquet(f'{PATH}/batch/esacci7_full/time/refs.{x}.parq') - if not raw: - raw = df['raw'][0] - else: - raw += df['raw'][0] - -df.to_parquet(f'{PATH}/batch/esacci7_full/time/refs.0.parq') - -#df.to_csv('time0.7.csv') \ No newline at end of file diff --git a/pipeline/compute/parallel/process_wrapper.py b/pipeline/compute/parallel/process_wrapper.py deleted file mode 100644 index 19f841c..0000000 --- a/pipeline/compute/parallel/process_wrapper.py +++ /dev/null @@ -1,80 +0,0 @@ -#python -import os -import sys -from getopt import getopt -import numpy as np - -BASE = '/home/users/dwest77/Documents/kerchunk_dev/kerchunk-builder' - -PATH = '/home/users/dwest77/Documents/kerchunk_dev/kerchunk-builder/temp/ocean-daily-all' - -dirs = [ - f'{PATH}/outs', - f'{PATH}/errs', - f'{PATH}/jbs_sbatch', - f'{PATH}/filelists' -] - -def mkfiles(p): - if not os.path.isdir(p): - os.makedirs(p) - else: - os.system(f'rm -rf {p}/*') - -for d in dirs: - mkfiles(d) - -SBATCH = """#!/bin/bash -#SBATCH --partition=short-serial-4hr -#SBATCH --account=short4hr -#SBATCH --job-name={} - -#SBATCH --time={} -#SBATCH --time-min=10:00 -#SBATCH --mem=2G - -#SBATCH -o {} -#SBATCH -e {} -{} - -module add jaspy -source {} -python {} {} -""" - -def format_sbatch(jobname, time, outs, errs, dependency, venvpath, script, cmdargs): - outs = f'{PATH}/outs/{outs}' - errs = f'{PATH}/errs/{errs}' - return SBATCH.format( - jobname, - time, - outs, - errs, - dependency, - venvpath, - script, - cmdargs) - -with open(f'{BASE}/test_parqs/filelists/gargant.txt') as f: - files = [r.split('\n')[0] for r in f.readlines()] - -fcount = 160 #len(files)/4 - -VENVPATH = '/home/users/dwest77/Documents/kerchunk_dev/kerchunk-builder/build_venv/bin/activate' -script = f'{BASE}/processing/parallel/batch_process.py' -cmdargs = '$SLURM_ARRAY_TASK_ID 1600' - -arrayjob = format_sbatch( - 'parq_%A_%a', - '30:00', - '%A_%a.out', - '%A_%a.err', - '', - VENVPATH, - script, - cmdargs -) -with open(f'{PATH}/control.sbatch','w') as f: - f.write(arrayjob) -print(fcount) -os.system(f'sbatch --array=0-{int(fcount-1)} {PATH}/control.sbatch') \ No newline at end of file From 0a7262e20c1e6b443efadf9babe024d4a49309fe Mon Sep 17 00:00:00 2001 From: dwest77 Date: Thu, 28 Mar 2024 10:29:22 +0000 Subject: [PATCH 11/20] Remove old outdated scripts from current version --- pipeline/old/create_group.py | 87 ------------------------------------ pipeline/old/wide_config.py | 51 --------------------- pipeline/old/~compute.py | 63 -------------------------- 3 files changed, 201 deletions(-) delete mode 100644 pipeline/old/create_group.py delete mode 100644 pipeline/old/wide_config.py delete mode 100644 pipeline/old/~compute.py diff --git a/pipeline/old/create_group.py b/pipeline/old/create_group.py deleted file mode 100644 index 4c3a848..0000000 --- a/pipeline/old/create_group.py +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf-8 -*- -import sys -import json -import os -import random - -config = { - 'proj_code': None, - 'workdir': None, - 'proj_dir':None, - 'pattern': None, - 'update': None, - 'remove': 
None -} - -general = "/badc/cmip6/data/CMIP6/" -groupdir = '' -workdir = '' - -# List 100 random CMIP datasets - -def get_CMIP_data_recursive(path): - contents = [] - for c in os.listdir(path): - if os.path.isdir(os.path.join(path,c)): - contents.append(c) - if len(contents) > 0: - randsel = contents[random.randint(0,len(contents)-1)] - return get_CMIP_data_recursive(os.path.join(path, randsel)) - else: - return path - -def get_proj_code(path, prefix=''): - return path.replace(prefix,'').replace('/','_') - -def get_fpaths(): - file = f'{groupdir}/CMIP6_rand100_00/proj_codes.txt' - with open(file) as f: - contents = [r.strip() for r in f.readlines()] - return contents - -def test_cmip6(): - fpaths = get_fpaths() - word = '' - for x in range(400): - print(x) - fpath = get_CMIP_data_recursive(general) - while fpath in fpaths: - fpath = get_CMIP_data_recursive(general) - proj_code = get_proj_code(fpath) - workdir = '/gws/nopw/j04/esacci_portal/kerchunk/pipeline/in_progress' - proj_dir = f'{workdir}/{proj_code}' - pattern = f'{os.path.realpath(fpath)}/*.nc' - word += f'{proj_code},{workdir},{proj_dir},{pattern},,\n' - - if not os.path.isdir(f'{groupdir}/CMIP6_rand400_00'): - os.makedirs(f'{groupdir}/CMIP6_rand400_00') - - with open(f'{groupdir}/CMIP6_rand400_00/datasets.csv','w') as f: - f.write(word) - print('Wrote 100 datasets to config group CMIP6_rand100_00') - -if __name__ == '__main__': - # Get a list of paths from some input file - # For each path, get project_code, workdir, proj_dir, pattern. - - group = sys.argv[1] - prefix = sys.argv[2] - - groupdir = os.environ['GROUPDIR'] - workdir = os.environ['WORKDIR'] - - with open(f'{groupdir}/filelists/{group}.txt') as f: - datasets = [r.strip() for r in f.readlines()] - records = '' - for ds in datasets: - proj_code = get_proj_code(ds, prefix=prefix) - proj_dir = f'{workdir}/{group}/{proj_code}' - pattern = f'{os.path.realpath(ds)}/*.nc' - records += f'{proj_code},{workdir},{proj_dir},{pattern},,\n' - - if not os.path.isdir(f'{groupdir}/{group}'): - os.makedirs(f'{groupdir}/{group}') - - with open(f'{groupdir}/{group}/datasets.csv','w') as f: - f.write(records) - print(f"Wrote {len(datasets)} datasets to config group {group}") diff --git a/pipeline/old/wide_config.py b/pipeline/old/wide_config.py deleted file mode 100644 index 45557cf..0000000 --- a/pipeline/old/wide_config.py +++ /dev/null @@ -1,51 +0,0 @@ -import sys -import json -import os - -config = { - 'proj_code': None, - 'workdir': None, - 'proj_dir':None, - 'pattern': None, - 'update': None, - 'remove': None -} - -if __name__ == '__main__': - csvfile = sys.argv[1] - - groupdir = os.environ['GROUPDIR'] - groupid = csvfile.split('/')[-2] - - # Open csv and gather data - with open(f'{groupdir}/{csvfile}') as f: - datasets = {r.strip().split(',')[0]:r.strip().split(',')[1:] for r in f.readlines()[:]} - - # Configure for each dataset - params = list(config.keys()) - proj_codes = list(datasets.keys()) - for dsk in proj_codes: - ds = datasets[dsk] - cfg = dict(config) - cfg[params[0]] = dsk - for x, p in enumerate(params[1:]): - cfg[p] = ds[x] - - # Save config file - if not os.path.isdir(cfg['proj_dir']): - os.makedirs(cfg['proj_dir']) - - with open(f'{cfg["proj_dir"]}/base-cfg.json','w') as f: - f.write(json.dumps(cfg)) - - else: - print(f'{cfg["proj_code"]} already exists - skipping') - - print(f'Exported {len(proj_codes)} dataset config files') - - if not os.path.isdir(f'{groupdir}/{groupid}'): - os.makedirs(f'{groupdir}/{groupid}') - with 
open(f'{groupdir}/{groupid}/proj_codes.txt','w') as f: - f.write('\n'.join(proj_codes)) - - print('Written as group ID:',groupid) \ No newline at end of file diff --git a/pipeline/old/~compute.py b/pipeline/old/~compute.py deleted file mode 100644 index c8e4a5e..0000000 --- a/pipeline/old/~compute.py +++ /dev/null @@ -1,63 +0,0 @@ -# Main script for processing, runs all other parts as needed including submitting batch jobs for large parquet sets. -import sys -import os -import json - -from serial.CFG_create_kerchunk import Indexer - -def rundecode(cfgs): - """ - cfgs - list of command inputs depending on user input to this program - """ - flags = { - '-w': 'workdir', - '-i': 'groupid', - '-g': 'groupdir' - } - kwargs = {} - for x in range(0,int(len(cfgs)),2): - try: - flag = flags[cfgs[x]] - kwargs[flag] = cfgs[x+1] - except KeyError: - print('Unrecognised cmdarg:',cfgs[x:x+1]) - - return kwargs - -def setup_compute(proj_code, workdir=None, **kwargs): - if os.getenv('KERCHUNK_DIR'): - workdir = os.getenv('KERCHUNK_DIR') - - cfg_file = f'{workdir}/in_progress/{proj_code}/base-cfg.json' - if os.path.isfile(cfg_file): - with open(cfg_file) as f: - cfg = json.load(f) - else: - print(f'Error: cfg file missing or not provided - {cfg_file}') - return None - - detail_file = f'{workdir}/in_progress/{proj_code}/detail-cfg.json' - if os.path.isfile(detail_file): - with open(detail_file) as f: - detail = json.load(f) - else: - print(f'Error: cfg file missing or not provided - {detail_file}') - return None - - if detail['type'] == 'JSON': - Indexer(proj_code, cfg=cfg, detail=detail, **kwargs).create_refs() - else: - pass - -def get_proj_code(groupdir, pid, groupid): - with open(f'{groupdir}/{groupid}/proj_codes.txt') as f: - proj_code = f.readlines()[int(pid)].strip() - return proj_code - -if __name__ == '__main__': - proj_code = sys.argv[1] - kwargs = rundecode(sys.argv[2:]) - if 'groupid' in kwargs: - proj_code = get_proj_code(kwargs['groupdir'], proj_code, kwargs['groupid']) - - setup_compute(proj_code, **kwargs) \ No newline at end of file From f82398795c42b72338c2a7e470af6e4fdaf5f336 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Thu, 28 Mar 2024 10:36:49 +0000 Subject: [PATCH 12/20] Finished adding all docstrings --- pipeline/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pipeline/utils.py b/pipeline/utils.py index 30baf85..5d39f0b 100644 --- a/pipeline/utils.py +++ b/pipeline/utils.py @@ -212,7 +212,10 @@ def get_proj_file(proj_dir: str, proj_file: str) -> dict | None: :returns: A dictionary of the contents of a json file or None if there are problems. """ - projfile = f'{proj_dir}/{proj_file}' + if not proj_file: + projfile = proj_dir + else: + projfile = f'{proj_dir}/{proj_file}' if os.path.isfile(projfile): try: with open(projfile) as f: @@ -225,7 +228,7 @@ def get_proj_file(proj_dir: str, proj_file: str) -> dict | None: else: return None -def set_proj_file(proj_dir: str, proj_file: str, contents: list, logger: logging.Logger) -> None: +def set_proj_file(proj_dir: str, proj_file: str, contents: dict, logger: logging.Logger) -> None: """ Overwrite the contents of a project file within a project code directory. @@ -233,6 +236,9 @@ def set_proj_file(proj_dir: str, proj_file: str, contents: list, logger: logging :param proj_file: (str) Name of a file to access within the project directory. + :param contents: (dict) Dictionary to write into json format config file within + the project directory. 
+ :returns: A dictionary of the contents of a json file or None if there are problems. """ projfile = f'{proj_dir}/{proj_file}' From 3c9b924a0067caa415cf4e9ad4ce20d953e14e78 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Thu, 28 Mar 2024 10:37:49 +0000 Subject: [PATCH 13/20] Updated compute link: --- docs/source/pipeline-source.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/source/pipeline-source.rst b/docs/source/pipeline-source.rst index 9bfec8f..7887ed8 100644 --- a/docs/source/pipeline-source.rst +++ b/docs/source/pipeline-source.rst @@ -21,11 +21,9 @@ Compute Module **Serial Processor** -.. automodule:: pipeline.compute.serial_process +.. automodule:: pipeline.compute :members: - - .. autoclass:: Converter - :members: + :show-inheritance: ================= Validation Module From b9261eb09f78599bb710f03d291e67ec5e03a620 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Thu, 28 Mar 2024 10:38:14 +0000 Subject: [PATCH 14/20] Updated compute workflow --- single_run.py | 64 ++++----------------------------------------------- 1 file changed, 4 insertions(+), 60 deletions(-) diff --git a/single_run.py b/single_run.py index f238dc6..70151c3 100644 --- a/single_run.py +++ b/single_run.py @@ -59,69 +59,13 @@ def run_compute(args, logger, fh=None, logid=None, **kwargs) -> None: :params fh: (str) Path to file for logger I/O when defining new logger. - :params logid: (str) Passed to Indexer for specifying a logger component. + :params logid: (str) Passed to KerchunkDSProcessor for specifying a logger component. :returns: None """ - from pipeline.compute.serial_process import Indexer - - logger.info(f'Starting computation step for {args.proj_code}') - - cfg_file = f'{args.proj_dir}/base-cfg.json' - detail_file = f'{args.proj_dir}/detail-cfg.json' - - if not os.path.isfile(cfg_file): - logger.error(f'cfg file missing or not provided - {cfg_file}') - return None - - if not os.path.isfile(detail_file): - logger.error(f'cfg file missing or not provided - {detail_file}') - return None - - version_no = 1 - complete, escape = False, False - while not (complete or escape): - out_json = f'{args.proj_dir}/kerchunk-{version_no}a.json' - out_parq = f'{args.proj_dir}/kerchunk-{version_no}a.parq' - - if os.path.isfile(out_json) or os.path.isfile(out_parq): - if args.forceful: - complete = True - elif args.new_version: - version_no += 1 - else: - escape = True - else: - complete = True - - concat_msg = '' # CMIP and CCI may be different? 
- - if complete and not escape: - - t1 = datetime.now() - ds = Indexer(args.proj_code, cfg_file=cfg_file, detail_file=detail_file, - workdir=args.workdir, issave_meta=True, thorough=args.quality, forceful=args.forceful, - verb=args.verbose, mode=args.mode, - version_no=version_no, concat_msg=concat_msg, bypass=args.bypass, groupID=args.groupID, - dryrun=args.dryrun, fh=fh, logid=logid) - ds.create_refs() - - compute_time = (datetime.now()-t1).total_seconds() - - detailfile = f'{args.proj_dir}/detail-cfg.json' - with open(detailfile) as f: - detail = json.load(f) - if 'timings' not in detail: - detail['timings'] = {} - detail['timings']['convert_actual'] = ds.convert_time - detail['timings']['concat_actual'] = ds.concat_time - detail['timings']['compute_actual'] = compute_time - with open(detailfile,'w') as f: - f.write(json.dumps(detail)) - - else: - logger.error('Output file already exists and there is no plan to overwrite') - return None + from pipeline.compute import compute_config + logger.info('Starting compute process') + compute_config(args, fh=fh, logid=logid, **kwargs) def run_validation(args, logger, fh=None, **kwargs) -> None: """ From 2b1aa005be317c4332d658d9946389cae42b8a56 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Thu, 28 Mar 2024 10:38:34 +0000 Subject: [PATCH 15/20] Added test script for eventual pipeline testing --- pipeline/tests.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 pipeline/tests.py diff --git a/pipeline/tests.py b/pipeline/tests.py new file mode 100644 index 0000000..136a757 --- /dev/null +++ b/pipeline/tests.py @@ -0,0 +1,29 @@ +__author__ = "Daniel Westwood" +__contact__ = "daniel.westwood@stfc.ac.uk" +__copyright__ = "Copyright 2023 United Kingdom Research and Innovation" + +class TestArgs: + def __init__(self): + self.blacklist = None + self.reason = None + + self.option = None + self.cleanup = None + self.upgrade = None + self.long = None + + self.jobID = None + self.phase = None + self.repeat_id = 'main' + self.new_id = None + + self.error = '' + self.examine = None + + self.write = None + self.overwrite = 0 + + self.workdir = None + self.groupdir = None + self.verbose = None + self.mode = None \ No newline at end of file From 416197a9ad2aa3ccf788d69bdb2a9c170f88f450 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Thu, 28 Mar 2024 10:48:30 +0000 Subject: [PATCH 16/20] Finished docstring updates --- pipeline/scan.py | 245 +++++++++++++++++------------------------------ 1 file changed, 90 insertions(+), 155 deletions(-) diff --git a/pipeline/scan.py b/pipeline/scan.py index 016ccda..04bb8c4 100644 --- a/pipeline/scan.py +++ b/pipeline/scan.py @@ -18,12 +18,14 @@ import numpy as np from pipeline.logs import init_logger -from pipeline.utils import get_attribute, BypassSwitch +from pipeline.utils import get_attribute, BypassSwitch, get_codes, get_proj_dir, get_proj_file, set_codes from pipeline.errors import * -from pipeline.compute.serial_process import Converter, Indexer +from pipeline.compute import KerchunkConverter, KerchunkDSProcessor -def format_float(value: int, logger): - """Format byte-value with proper units""" +def format_float(value: int, logger) -> str | None: + """ + Format byte-value with proper units + """ logger.debug(f'Formatting value {value} in bytes') if value: unit_index = 0 @@ -35,71 +37,23 @@ def format_float(value: int, logger): else: return None -def safe_format(value: int, fstring: str): - """Attempt to format a string given some fstring template.""" +def safe_format(value: int, fstring: 
str) -> str: + """Attempt to format a string given some fstring template. + - Handles issues by returning '', usually when value is None initially.""" try: return fstring.format(value=value) except: return '' - -def trial_kerchunk(args, nfile: str, ctype: str, logger): - """Perform Kerchunk reading on specific file""" - logger.info(f'Running Kerchunk reader for {nfile}') - - quickConvert = Converter(logger, bypass_driver=args.bypass.skip_driver) - - kwargs = {} - supported_extensions = ['ncf3','hdf5','tif'] - - usetype = ctype - - logger.debug(f'Attempting conversion for 1 {ctype} extension') - t1 = datetime.now() - tdict = quickConvert.convert_to_zarr(nfile, ctype, **kwargs) - t_len = (datetime.now()-t1).total_seconds() - ext_index = 0 - while not tdict and ext_index < len(supported_extensions)-1: - # Try the other ones - extension = supported_extensions[ext_index] - logger.debug(f'Attempting conversion for {extension} extension') - - if extension != ctype: - t1 = datetime.now() - tdict = quickConvert.convert_to_zarr(nfile, extension, **kwargs) - t_len = (datetime.now()-t1).total_seconds() - usetype = extension - ext_index += 1 - - if not tdict: - logger.error('Scanning failed for all drivers, file type is not Kerchunkable') - raise KerchunkDriverFatalError - else: - logger.info(f'Scan successful with {usetype} driver') - return tdict, usetype, t_len - -def load_from_previous(args, cache_id, logger): - cachefile = f'{args.proj_dir}/cache/{cache_id}.json' - if os.path.isfile(cachefile): - logger.info(f"Found existing cached file {cache_id}.json") - with open(cachefile) as f: - refs = json.load(f) - return refs - else: - return None -def perform_scan(args, testfile: str, ctype: str, logger, savecache=True, cache_id=None, thorough=False): - """Map to kerchunk data and perform calculations on test netcdf file.""" - if cache_id and not thorough: - refs = load_from_previous(args, cache_id, logger) - time = 0 - if not refs: - refs, ctype, time = trial_kerchunk(args, testfile, ctype, logger) - else: - refs, ctype, time = trial_kerchunk(args, testfile, ctype, logger) +def summarise_json(args, count: int, ctype: str, logger) -> tuple: + """ + Open previously written JSON cached files and perform analysis. + """ + refs = get_proj_file(args.proj_dir, f'cache/{count}.json') if not refs: - return None, None, None, None, None + return None, None, None, None - logger.debug('Starting Analysis of references') + logger.debug(f'Starting Analysis of references for {count}') # Perform summations, extract chunk attributes sizes = [] @@ -123,36 +77,45 @@ def perform_scan(args, testfile: str, ctype: str, logger, savecache=True, cache_ chunksize = dict(kdict[chunkkey])['chunks'] vars[var] = chunksize - # Save refs individually within cache. 
- if savecache: - cachedir = f'{args.proj_dir}/cache' - if not os.path.isdir(cachedir): - os.makedirs(cachedir) - with open(f'{cachedir}/{cache_id}.json','w') as f: - f.write(json.dumps(refs)) + return np.sum(sizes), chunks, vars, ctype - return np.sum(sizes), chunks, vars, ctype, time - -def eval_sizes(files: list): +def eval_sizes(files: list) -> list: """Get a list of file sizes on disk from a list of filepaths""" return [os.stat(files[count]).st_size for count in range(len(files))] -def get_seconds(time_allowed: str): +def get_seconds(time_allowed: str) -> int: """Convert time in MM:SS to seconds""" if not time_allowed: return 10000000000 mins, secs = time_allowed.split(':') return int(secs) + 60*int(mins) -def format_seconds(seconds: int): +def format_seconds(seconds: int) -> str: """Convert time in seconds to MM:SS""" mins = int(seconds/60) + 1 if mins < 10: mins = f'0{mins}' return f'{mins}:00' -def perform_safe_calculations(std_vars: list, cpf: list, volms: list, files: list, times: list, logger): - """Perform all calculations safely to mitigate errors that come through during data collation.""" +def perform_safe_calculations(std_vars: list, cpf: list, volms: list, files: list, logger) -> tuple: + """ + Perform all calculations safely to mitigate errors that arise during data collation. + + :param std_vars: (list) A list of the variables collected, which should be the same across + all input files. + + :param cpf: (list) The chunks per file recorded for each input file. + + :param volms: (list) The total data size recorded for each input file. + + :param files: (list) A list of the paths to each file. + + :param logger: (obj) Logging object for info/debug/error messages. + + :returns: Average values of: chunks per file (cpf), number of variables (num_vars), chunk size (avg_chunk), + spatial resolution of each chunk assuming 2:1 ratio lat/lon (spatial_res), totals of NetCDF and Kerchunk estimate + data amounts, number of files, total number of chunks and the addition percentage. + """ kchunk_const = 167 # Bytes per Kerchunk ref (standard/typical) if std_vars: num_vars = len(std_vars) @@ -179,10 +142,10 @@ def perform_safe_calculations(std_vars: list, cpf: list, volms: list, files: lis spatial_res = None if files and avg_vol: - data_represented = avg_vol*len(files) + netcdf_data = avg_vol*len(files) num_files = len(files) else: - data_represented = None + netcdf_data = None num_files = None if files and avg_cpf: @@ -195,60 +158,68 @@ def perform_safe_calculations(std_vars: list, cpf: list, volms: list, files: lis else: addition = None - if files and len(times) > 0: - estm_time = int(np.mean(times)*len(files)) + if avg_cpf and num_files: + kerchunk_data = avg_cpf * num_files * kchunk_const else: - estm_time = 0 + kerchunk_data = None - return avg_cpf, num_vars, avg_chunk, spatial_res, data_represented, num_files, total_chunks, addition, estm_time + return avg_cpf, num_vars, avg_chunk, spatial_res, netcdf_data, kerchunk_data, num_files, total_chunks, addition -def write_skip(proj_dir, proj_code, logger): +def write_skip(proj_dir: str, proj_code: str, logger) -> None: + """ + Quick function to write a 'skipped' detail file. 
+ """ details = {'skipped':True} with open(f'{proj_dir}/detail-cfg.json','w') as f: f.write(json.dumps(details)) logger.info(f'Skipped scanning - {proj_code}/detail-cfg.json blank file created') -def scan_dataset(args, files: list, logger): +def scan_dataset(args, files: list, logger) -> None: """Main process handler for scanning phase""" proj_code = args.proj_code proj_dir = args.proj_dir + detailfile = f'{proj_dir}/detail-cfg.json' + cfgfile = f'{proj_dir}/base-cfg.json' logger.debug(f'Assessment for {proj_code}') # Set up conditions, skip for small file count < 5 escape, is_varwarn, is_skipwarn = False, False, False - cpf, volms, times = [],[],[] - trial_files = 5 + cpf, volms = [],[] - if len(files) < 5: + if len(files) < 3: write_skip(proj_dir, proj_code, logger) return None - else: - logger.info(f'Identified {len(files)} files for scanning') # Perform scans for sample (max 5) files - count = 0 std_vars = None std_chunks = None ctypes = [] + ctype = None - scanfile = files[0] - if '.' in scanfile: - ctype = f'.{scanfile.split(".")[-1]}' - else: - ctype = 'ncf3' + # Create all files in mini-kerchunk set here. Then try an assessment. + limiter = int(len(files)/20) + limiter = max(2, limiter) + limiter = min(100, limiter) - filecap = min(100,len(files)) - while not escape and len(cpf) < trial_files: - logger.info(f'Attempting scan for file {count+1} (min 5, max 100)') - # Add random file selector here - scanfile = files[count] + logger.info(f'Determined {limiter} files to scan') + + mini_ds = KerchunkDSProcessor( + args.proj_code, + cfg_file=cfgfile, detail_file=detailfile, workdir=args.workdir, + thorough=True, forceful=True, # Always run from scratch forcefully to get best time estimates. + version_no='trial-', verb=args.verbose, logid='0', + groupID=args.groupID, limiter=limiter) + + mini_ds.create_refs() + + logger.info(f'Summarising scan results for {limiter} files') + for count in range(limiter): try: - # Measure time and ensure job will not overrun if it can be prevented. 
- volume, chunks_per_file, varchunks, ctype, time = perform_scan(args, scanfile, ctype, logger, - savecache=True, cache_id=str(count), - thorough=args.quality) + volume, chunks_per_file, varchunks, ctype = summarise_json(args, count, ctype, logger) vars = sorted(list(varchunks.keys())) + + # Keeping the below options although may be redundant as have already processed the files if not std_vars: std_vars = vars if vars != std_vars: @@ -261,13 +232,9 @@ def scan_dataset(args, files: list, logger): if std_chunks[var] != varchunks[var]: raise ConcatFatalError(var=var, chunk1=std_chunks[var], chunk2=varchunks[var]) - if count == 0 and time > get_seconds(args.time_allowed)/trial_files: - raise ExpectTimeoutError(required=format_seconds(time*5), current=args.time_allowed) - cpf.append(chunks_per_file) volms.append(volume) ctypes.append(ctype) - times.append(time) logger.info(f'Data recorded for file {count+1}') except ExpectTimeoutError as err: @@ -276,28 +243,30 @@ def scan_dataset(args, files: list, logger): raise err except Exception as err: raise err - count += 1 - if count >= filecap: - escape = True - if escape: - raise FilecapExceededError(filecap) - logger.info('Scan complete, compiling outputs') + logger.info('Summary complete, compiling outputs') (avg_cpf, num_vars, avg_chunk, - spatial_res, data_represented, num_files, - total_chunks, addition, estm_time) = perform_safe_calculations(std_vars, cpf, volms, files, times, logger) + spatial_res, netcdf_data, kerchunk_data, num_files, + total_chunks, addition) = perform_safe_calculations(std_vars, cpf, volms, files, logger) c2m = 167 # Memory for each chunk in kerchunk in B details = { - 'netcdf_data' : format_float(data_represented, logger), - 'kerchunk_data' : format_float(avg_cpf * num_files * c2m, logger), + 'netcdf_data' : format_float(netcdf_data, logger), + 'kerchunk_data' : format_float(kerchunk_data, logger), 'num_files' : num_files, 'chunks_per_file' : safe_format(avg_cpf,'{value:.1f}'), 'total_chunks' : safe_format(total_chunks,'{value:.2f}'), 'estm_chunksize' : format_float(avg_chunk,logger), 'estm_spatial_res' : safe_format(spatial_res,'{value:.2f}') + ' deg', - 'estm_time' : format_seconds(estm_time), + 'timings' : { + 'convert_estm' : mini_ds.convert_time, + 'concat_estm' : mini_ds.concat_time, + 'validate_estm' : mini_ds.validate_time, + 'convert_actual' : None, + 'concat_actual' : None, + 'validate_actual': None, + }, 'variable_count' : num_vars, 'addition' : safe_format(addition,'{value:.3f}') + ' %', 'var_err' : is_varwarn, @@ -324,51 +293,17 @@ def scan_dataset(args, files: list, logger): # Replace with dumping dictionary f.write(json.dumps(details)) logger.info(f'Written output file {proj_code}/detail-cfg.json') - logger.info('Performing concatenation attempt with minimal files') - try: - assemble_trial_concatenation(args, ctype, logger) - except Exception as err: - logger.error('Error in concatenating files') - raise err - -def assemble_trial_concatenation(args, ctype, logger): - cfg_file = f'{args.proj_dir}/base-cfg.json' - detail_file = f'{args.proj_dir}/detail-cfg.json' - - idx_trial = Indexer(args.proj_code, cfg_file=cfg_file, detail_file=detail_file, - workdir=args.workdir, issave_meta=True, thorough=False, forceful=args.forceful, - verb=args.verbose, mode=args.mode, - bypass=args.bypass, groupID=args.groupID, limiter=2, ctype=ctype) - - idx_trial.create_refs() - with open(detail_file,'w') as f: - f.write(json.dumps(idx_trial.collect_details())) - logger.debug('Collected new details into detail-cfg.json') - - -def 
scan_config(args, fh=None, logid=None, **kwargs): +def scan_config(args, fh=None, logid=None, **kwargs) -> None: """Configure scanning and access main section""" logger = init_logger(args.verbose, args.mode, 'scan',fh=fh, logid=logid) logger.debug(f'Setting up scanning process') - - cfg_file = f'{args.proj_dir}/base-cfg.json' - if os.path.isfile(cfg_file): - with open(cfg_file) as f: - cfg = json.load(f) - else: - os.system(f'ls {args.proj_dir}') - logger.error(f'cfg file missing or not provided - {cfg_file}') - return None args.workdir = get_attribute('WORKDIR', args, 'workdir') args.groupdir = get_attribute('GROUPDIR', args, 'groupdir') - if args.groupID: - args.proj_dir = f'{args.workdir}/in_progress/{args.groupID}/{args.proj_code}' - else: - args.proj_dir = f'{args.workdir}/in_progress/{args.proj_code}' + args.proj_dir = get_proj_dir(args.proj_code, args.workdir, args.groupID) logger.debug(f"""Extracted attributes: {args.proj_code}, {args.workdir}, From 0c049ea4bfe6ab6b3ad530d4b623a4d2059032db Mon Sep 17 00:00:00 2001 From: dwest77 Date: Thu, 28 Mar 2024 10:49:06 +0000 Subject: [PATCH 17/20] Partially written validate docstrings --- pipeline/validate.py | 110 ++++++++++++------------------------------- 1 file changed, 31 insertions(+), 79 deletions(-) diff --git a/pipeline/validate.py b/pipeline/validate.py index a29b369..e4838d2 100644 --- a/pipeline/validate.py +++ b/pipeline/validate.py @@ -16,12 +16,13 @@ from pipeline.errors import * from pipeline.logs import init_logger, SUFFIXES, SUFFIX_LIST -from pipeline.utils import BypassSwitch +from pipeline.utils import BypassSwitch, open_kerchunk from ujson import JSONDecodeError +from dask.distributed import LocalCluster ## 1. Array Selection Tools -def find_dimensions(dimlen: int, divisions: int): +def find_dimensions(dimlen: int, divisions: int) -> int: """Determine index of slice end position given length of dimension and fraction to assess""" # Round down then add 1 slicemax = int(dimlen/divisions)+1 @@ -41,9 +42,9 @@ def get_vslice(shape: list, dtypes: list, lengths: list, divisions: list, logger logger.debug(f'Slice {vslice}') return vslice -def get_concat_dims(xfiles, detailfile=None): - # Not usable with virtual dimensions - concat_dims = {'time':0} +def get_concat_dims(xobjs, detailfile=None): + """Retrieve the sizes of the concatenation dims""" + concat_dims={} if os.path.isfile(detailfile): with open(detailfile) as f: details = json.load(f) @@ -52,9 +53,7 @@ def get_concat_dims(xfiles, detailfile=None): for dim in details['concat_dims']: concat_dims[dim] = 0 - for xf in xfiles: - # Open netcdf in lowest memory intensive way possible. 
- ds = xr.open_dataset(xf) + for ds in xobjs: for dim in concat_dims.keys(): concat_dims[dim] += ds[dim].shape[0] return concat_dims @@ -99,7 +98,7 @@ def pick_index(nfiles: list, indexes: list): indexes.append(index) return indexes -def locate_kerchunk(args, logger, get_str=False): +def locate_kerchunk(args, logger, get_str=False, remote_protocol='https'): """Gets the name of the latest kerchunk file for this project code""" files = os.listdir(args.proj_dir) # Get filename only kfiles = [] @@ -122,7 +121,7 @@ def locate_kerchunk(args, logger, get_str=False): if get_str: return kfile, False else: - return open_kerchunk(kfile, logger, remote_protocol='https'), False + return open_kerchunk(kfile, logger, remote_protocol=remote_protocol), False elif check_complete: if not args.forceful: logger.error('File already exists and no override is set') @@ -149,44 +148,6 @@ def locate_kerchunk(args, logger, get_str=False): logger.error(f'No Kerchunk file located at {args.proj_dir} and no in-place validation indicated - exiting') raise MissingKerchunkError -def open_kerchunk(kfile: str, logger, isparq=False, remote_protocol='file'): - """Open kerchunk file from JSON/parquet formats""" - if isparq: - logger.debug('Opening Kerchunk Parquet store') - from fsspec.implementations.reference import ReferenceFileSystem - fs = ReferenceFileSystem( - kfile, - remote_protocol='file', - target_protocol="file", - lazy=True) - return xr.open_dataset( - fs.get_mapper(), - engine="zarr", - backend_kwargs={"consolidated": False, "decode_times": False} - ) - else: - logger.debug('Opening Kerchunk JSON file') - try: - mapper = fsspec.get_mapper('reference://',fo=kfile, target_options={"compression":None}, remote_protocol=remote_protocol) - except JSONDecodeError as err: - logger.error(f"Kerchunk file {kfile} appears to be empty") - raise MissingKerchunkError - # Need a safe repeat here - ds = None - attempts = 0 - while attempts < 3 and not ds: - attempts += 1 - try: - ds = xr.open_zarr(mapper, consolidated=False, decode_times=True) - except OverflowError: - ds = None - except Exception as err: - raise MissingKerchunkError(message=f'Failed to open kerchunk file {kfile}') - if not ds: - raise ChunkDataError - logger.debug('Successfully opened Kerchunk with virtual xarray ds') - return ds - def mem_to_value(mem): """Convert a memory value i.e 2G into a value""" suffix = mem[-1] @@ -199,17 +160,6 @@ def value_to_mem(value): suffix_index += 1 return f'{value:.0f}{SUFFIX_LIST[suffix_index]}' -def check_memory(nfiles, indexes, mem, logger): - logger.info(f'Performing Memory Allowance check for {len(indexes)} files') - memcap = mem_to_value(mem) - nftotal = 0 - for index in indexes: - nftotal += os.path.getsize(nfiles[index]) - - logger.debug(f'Determined memory requirement is {nftotal} - allocated {memcap}') - if nftotal > memcap: - raise ExpectMemoryError(required=value_to_mem(nftotal), current=mem) - def open_netcdfs(args, logger, thorough=False, concat_dims='time'): """Returns a single xarray object with one timestep: - Select a single file and a single timestep from that file @@ -223,22 +173,13 @@ def open_netcdfs(args, logger, thorough=False, concat_dims='time'): thorough = True xobjs = [] if not thorough: - if not args.bypass.skip_memcheck: - check_memory(xfiles, indexes, args.memory, logger) - else: - logger.warning('Memory checks bypassed') - for one, i in enumerate(indexes): + for i in indexes: xobjs.append(xr.open_dataset(xfiles[i])) - if len(xobjs) == 0: logger.error('No valid timestep objects identified') raise 
NoValidTimeSlicesError(message='Kerchunk', verbose=args.verbose) return xobjs, indexes, xfiles else: - if not args.bypass.skip_memcheck: - check_memory(xfiles, [i for i in range(len(xfiles))], args.memory, logger) - else: - logger.warning('Memory checks bypassed') xobj = xr.concat([xr.open_dataset(fx) for fx in xfiles], dim=concat_dims, data_vars='minimal') return xobj, None, xfiles @@ -286,7 +227,11 @@ def compare_data(vname: str, xbox, kerchunk_box, logger, bypass=False): tolerance = None testpass = True - if not np.array_equal(xbox, kerchunk_box, equal_nan=True): + try: + equality = np.array_equal(xbox, kerchunk_box, equal_nan=True) + except TypeError as err: + equality = np.array_equal(xbox, kerchunk_box) + if not equality: logger.warning(f'Failed equality check for {vname}') raise ValidationError try: @@ -324,6 +269,10 @@ def compare_data(vname: str, xbox, kerchunk_box, logger, bypass=False): raise ValidationError def validate_shape_to_tolerance(nfiles: int, xv, dims, xshape, kshape, logger, detailfile=None): + """Special case function for validating a shaped array to some tolerance + - Alternative to opening N files, only works if each file has roughly the same total shape. + - Tolerance is based on the number of files supplied, more files means the tolerance is lower? + """ tolerance = 1/(nfiles*5) logger.info(f'Attempting shape bypass using concat-dim tolerance {tolerance*100}%') try: @@ -621,6 +570,9 @@ def validate_dataset(args, fh=None, logid=None, **kwargs): logger = init_logger(args.verbose, args.mode,'validate', fh=fh, logid=logid) logger.info(f'Starting tests for {args.proj_code}') + # Experimenting with a local dask cluster for memory limit + cluster = LocalCluster(n_workers=1, threads_per_worker=1, memory_target_fraction=0.95, memory_limit=str(args.memory + 'B')) + if hasattr(args, 'backtrack'): if args.backtrack: run_backtrack(args, logger) @@ -663,15 +615,15 @@ def validate_dataset(args, fh=None, logid=None, **kwargs): else: ## Set up loop variables fullset = bool(args.quality) - concat_dims = get_concat_dims(xfiles, detailfile=f'{args.proj_dir}/detail-cfg.json') - - logger.info(f"Attempting file subset validation: {len(indexes)}/{nfiles}") - for step, index in enumerate(indexes): - xobj = xobjs[step] - logger.info(f'Running tests for selected file: {index} ({step+1}/{len(indexes)})') - fullset = attempt_timestep(args, xobj, kobj, step, nfiles, logger, concat_dims=concat_dims) - if fullset: - break + concat_dims = get_concat_dims(xobjs, detailfile=f'{args.proj_dir}/detail-cfg.json') + if not fullset: + logger.info(f"Attempting file subset validation: {len(indexes)}/{nfiles}") + for step, index in enumerate(indexes): + xobj = xobjs[step] + logger.info(f'Running tests for selected file: {index} ({step+1}/{len(indexes)})') + fullset = attempt_timestep(args, xobj, kobj, step, nfiles, logger, concat_dims=concat_dims) + if fullset: + break if fullset: print() From 04b501b12e0585e2eefed9368f2be1d495263df8 Mon Sep 17 00:00:00 2001 From: dwest77a Date: Thu, 28 Mar 2024 11:09:23 +0000 Subject: [PATCH 18/20] Fixed some documentation issues --- docs/requirements.txt | 5 ++++- pipeline/scan.py | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 4e46ae4..d1c1e3b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,4 +4,7 @@ h5py==3.9.0 xarray==2023.1.0 kerchunk==0.2.0 numpy==1.24.4 -scipy==1.10.1 \ No newline at end of file +scipy==1.10.1 +dask==2024.3.1 +binpacking==1.5.2 +distributed==2024.3.1 \ 
No newline at end of file diff --git a/pipeline/scan.py b/pipeline/scan.py index 04bb8c4..61004a9 100644 --- a/pipeline/scan.py +++ b/pipeline/scan.py @@ -112,9 +112,9 @@ def perform_safe_calculations(std_vars: list, cpf: list, volms: list, files: lis :param logger: (obj) Logging object for info/debug/error messages. - :returns: Average values of: chunks per file (cpf), number of variables (num_vars), chunk size (avg_chunk), - spatial resolution of each chunk assuming 2:1 ratio lat/lon (spatial_res), totals of NetCDF and Kerchunk estimate - data amounts, number of files, total number of chunks and the addition percentage. + :returns: Average values of: chunks per file (cpf), number of variables (num_vars), chunk size (avg_chunk), + spatial resolution of each chunk assuming 2:1 ratio lat/lon (spatial_res), totals of NetCDF and Kerchunk estimate + data amounts, number of files, total number of chunks and the addition percentage. """ kchunk_const = 167 # Bytes per Kerchunk ref (standard/typical) if std_vars: From 7df1532a4c26f4dadd81706fca6d3118cb2f61d0 Mon Sep 17 00:00:00 2001 From: dwest77 Date: Thu, 28 Mar 2024 11:56:47 +0000 Subject: [PATCH 19/20] Minor changes to enable simplescan demo --- pipeline/compute.py | 95 ++++---- pipeline/scan.py | 27 ++- pipeline/utils.py | 8 + showcase/notebooks/KerchunkSimpleScan.ipynb | 245 ++++++++++++++++++++ 4 files changed, 321 insertions(+), 54 deletions(-) create mode 100644 showcase/notebooks/KerchunkSimpleScan.ipynb diff --git a/pipeline/compute.py b/pipeline/compute.py index 7c5d0cd..8e4a686 100644 --- a/pipeline/compute.py +++ b/pipeline/compute.py @@ -23,13 +23,23 @@ class KerchunkConverter: """Class for converting a single file to a Kerchunk reference object""" - def __init__(self, clogger, bypass_driver=False, ctype=None) -> None: + def __init__(self, clogger=None, bypass_driver=False, ctype=None, verbose=1) -> None: + if not clogger: + clogger = init_logger(verbose,0,'convert-trial') + self.logger = clogger self.ctype = ctype self.success = True self.bypass_driver = bypass_driver self.loaded_refs = False + self.drivers = { + 'ncf3': self.ncf3_to_zarr, + 'hdf5': self.hdf5_to_zarr, + 'tif' : self.tiff_to_zarr, + 'grib': self.grib_to_zarr, + } + def convert_to_zarr(self, nfile: str, extension=False, **kwargs) -> None: """ Perform conversion to zarr with exceptions for bypassing driver errors. @@ -46,17 +56,12 @@ def convert_to_zarr(self, nfile: str, extension=False, **kwargs) -> None: if the driver is unsuccessful. Errors will be bypassed if the bypass_driver option is selected for this class. """ - drivers = { - 'ncf3': self.ncf3_to_zarr, - 'hdf5': self.hdf5_to_zarr, - 'tif' : self.tiff_to_zarr, - 'grib': self.grib_to_zarr, - } + if extension: self.ctype=extension try: - if self.ctype in drivers: - ref = drivers[self.ctype](nfile, **kwargs) + if self.ctype in self.drivers: + ref = self.drivers[self.ctype](nfile, **kwargs) return ref else: self.logger.debug(f'Extension {self.ctype} not valid') @@ -67,6 +72,41 @@ def convert_to_zarr(self, nfile: str, extension=False, **kwargs) -> None: else: raise err + def try_all_drivers(self, nfile: str, **kwargs) -> dict | None: + """ + Safe creation allows for known issues and tries multiple drivers + + :returns: dictionary of Kerchunk references if successful, raises error + otherwise if unsuccessful. 
+ """ + + extension = False + supported_extensions = list(self.drivers.keys()) + + self.logger.debug(f'Attempting conversion for 1 {self.ctype} extension') + + if not self.ctype: + self.ctype = supported_extensions[0] + + tdict = self.convert_to_zarr(nfile, **kwargs) + ext_index = 0 + while not tdict and ext_index < len(supported_extensions)-1: + # Try the other ones + extension = supported_extensions[ext_index] + self.logger.debug(f'Attempting conversion for {extension} extension') + if extension != self.ctype: + tdict = self.convert_to_zarr(nfile, extension, **kwargs) + ext_index += 1 + + if not tdict: + self.logger.error('Scanning failed for all drivers, file type is not Kerchunkable') + raise KerchunkDriverFatalError + else: + if extension: + self.ctype = extension + self.logger.debug(f'Scan successful with {self.ctype} driver') + return tdict + def save_individual_ref(self, ref: dict, cache_ref: str, forceful=False) -> None: """ Save each individual set of refs created for each file immediately to reduce @@ -162,7 +202,7 @@ def __init__(self, """ if not logger: logger = init_logger(verb, mode, 'compute-serial', fh=fh, logid=logid) - super().__init__(logger, bypass_driver=bypass.skip_driver, ctype=ctype) + super().__init__(caselogger=logger, bypass_driver=bypass.skip_driver, ctype=ctype) self.logger.debug('Starting variable definitions') @@ -663,41 +703,6 @@ def save_metadata(self,zattrs: dict) -> dict: self.logger.debug('Saved global attribute cache') else: self.logger.debug('Skipped saving global attribute cache') - - def try_all_drivers(self, nfile: str, **kwargs) -> dict | None: - """ - Safe creation allows for known issues and tries multiple drivers - - :returns: dictionary of Kerchunk references if successful, raises error - otherwise if unsuccessful. - """ - - extension = False - supported_extensions = ['ncf3','hdf5','tif'] - - self.logger.debug(f'Attempting conversion for 1 {self.ctype} extension') - - if not self.ctype: - self.ctype = supported_extensions[0] - - tdict = self.convert_to_zarr(nfile, **kwargs) - ext_index = 0 - while not tdict and ext_index < len(supported_extensions)-1: - # Try the other ones - extension = supported_extensions[ext_index] - self.logger.debug(f'Attempting conversion for {extension} extension') - if extension != self.ctype: - tdict = self.convert_to_zarr(nfile, extension, **kwargs) - ext_index += 1 - - if not tdict: - self.logger.error('Scanning failed for all drivers, file type is not Kerchunkable') - raise KerchunkDriverFatalError - else: - if extension: - self.ctype = extension - self.logger.debug(f'Scan successful with {self.ctype} driver') - return tdict def load_temp_zattrs(self) -> dict: """ diff --git a/pipeline/scan.py b/pipeline/scan.py index 61004a9..53168b5 100644 --- a/pipeline/scan.py +++ b/pipeline/scan.py @@ -16,8 +16,9 @@ import math import json import numpy as np +import re -from pipeline.logs import init_logger +from pipeline.logs import init_logger, FalseLogger from pipeline.utils import get_attribute, BypassSwitch, get_codes, get_proj_dir, get_proj_file, set_codes from pipeline.errors import * from pipeline.compute import KerchunkConverter, KerchunkDSProcessor @@ -45,29 +46,37 @@ def safe_format(value: int, fstring: str) -> str: except: return '' -def summarise_json(args, count: int, ctype: str, logger) -> tuple: +def summarise_json(identifier, ctype: str, logger=None, proj_dir=None) -> tuple: """ Open previously written JSON cached files and perform analysis. 
""" - refs = get_proj_file(args.proj_dir, f'cache/{count}.json') + if not logger: + logger = FalseLogger() + + if type(identifier) == dict: + # Assume refs passed directly. + refs = identifier + else: + if proj_dir: + refs = get_proj_file(proj_dir, f'cache/{identifier}.json') + logger.debug(f'Starting Analysis of references for {identifier}') + if not refs: return None, None, None, None - logger.debug(f'Starting Analysis of references for {count}') - # Perform summations, extract chunk attributes sizes = [] vars = {} chunks = 0 kdict = refs['refs'] for chunkkey in kdict.keys(): - if len(kdict[chunkkey]) >= 2: + if bool(re.search(r'\d', chunkkey)): try: sizes.append(int(kdict[chunkkey][2])) - chunks += 1 except ValueError: pass - if '/.zarray' in chunkkey: + chunks += 1 + elif '/.zarray' in chunkkey: var = chunkkey.split('/')[0] chunksize = 0 if var not in vars: @@ -216,7 +225,7 @@ def scan_dataset(args, files: list, logger) -> None: logger.info(f'Summarising scan results for {limiter} files') for count in range(limiter): try: - volume, chunks_per_file, varchunks, ctype = summarise_json(args, count, ctype, logger) + volume, chunks_per_file, varchunks, ctype = summarise_json(count, ctype, logger=logger,proj_dir=args.proj_dir) vars = sorted(list(varchunks.keys())) # Keeping the below options although may be redundant as have already processed the files diff --git a/pipeline/utils.py b/pipeline/utils.py index 5d39f0b..053f6bf 100644 --- a/pipeline/utils.py +++ b/pipeline/utils.py @@ -260,3 +260,11 @@ def get_proj_dir(proj_code: str, workdir: str, groupID: str) -> str: return f'{workdir}/in_progress/{groupID}/{proj_code}' else: return f'{workdir}/in_progress/{proj_code}' + +def find_zarrays(refs: dict) -> dict: + """Quick way of extracting all the zarray components of a ref set.""" + zarrays = {} + for r in refs['refs'].keys(): + if '.zarray' in r: + zarrays[r] = refs['refs'][r] + return zarrays \ No newline at end of file diff --git a/showcase/notebooks/KerchunkSimpleScan.ipynb b/showcase/notebooks/KerchunkSimpleScan.ipynb new file mode 100644 index 0000000..f4aeddc --- /dev/null +++ b/showcase/notebooks/KerchunkSimpleScan.ipynb @@ -0,0 +1,245 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "73dddd62-e4c4-4cd0-9422-d6ebfcd42393", + "metadata": {}, + "source": [ + "# Assess a file for Kerchunkability with Padocc pipeline\n", + "Take an accepted input (single) file and attempt to convert using known methods." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "82807227-e389-46be-b872-431b579180e4", + "metadata": {}, + "outputs": [], + "source": [ + "from pipeline.compute import KerchunkConverter\n", + "from pipeline.scan import summarise_json\n", + "\n", + "nfile = '/badc/cmip6/data/CMIP6/C4MIP/CCCma/CanESM5/1pctCO2-rad/r1i1p1f1/AERmon/ps/gn/v20190429/ps_AERmon_CanESM5_1pctCO2-rad_r1i1p1f1_gn_185001-200012.nc'\n", + "\n", + "converter = KerchunkConverter(bypass_driver=True)\n", + "refs = converter.try_all_drivers(nfile)" + ] + }, + { + "cell_type": "markdown", + "id": "3aef3161-54ed-4744-9cfa-f2f7988aa81e", + "metadata": {}, + "source": [ + "The kerchunk converter will fail for invalid types for Kerchunking, some solutions may be possible but it may be that your files must instead be converted to Zarr.\n", + "We can otherwise assess the refs generated here." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "00213a67-cded-4e33-b3d6-6e3f0ce9396f", + "metadata": {}, + "outputs": [], + "source": [ + "volume, cpf, varchunks, ctype = summarise_json(refs, converter.ctype)" + ] + }, + { + "cell_type": "markdown", + "id": "cbad3eee-0efd-4b59-8054-98a0524a4a04", + "metadata": {}, + "source": [ + "Summarised outputs below." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "31873abe-5403-4896-a9b4-f1951deb3cc9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Assessment for selected file:\n", + " Size in bytes : 47520998\n", + " Chunks in file : 5440\n", + " Variables : ['lat', 'lat_bnds', 'lon', 'lon_bnds', 'ps', 'time', 'time_bnds']\n", + " Kerchunk Driver Type : hdf5\n", + "\n" + ] + } + ], + "source": [ + "print(f\"\"\"\n", + "Assessment for selected file:\n", + " Size in bytes : {volume}\n", + " Chunks in file : {cpf}\n", + " Variables : {list(varchunks.keys())}\n", + " Kerchunk Driver Type : {ctype}\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "6467829a-e682-4c92-bd2e-261754d6a88b", + "metadata": {}, + "source": [ + "`varchunks` contains the chunk size for each variable. Chunk size is N-dimensional, either the whole array for a specific variable or some subsection." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "00183712-ad63-4590-ad06-0955935eee71", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "lat [64]\n", + "lat_bnds [64, 2]\n", + "lon [128]\n", + "lon_bnds [128, 2]\n", + "ps [1, 64, 128]\n", + "time [1]\n", + "time_bnds [1, 2]\n" + ] + } + ], + "source": [ + "for var in varchunks.keys():\n", + " print(var, varchunks[var])" + ] + }, + { + "cell_type": "markdown", + "id": "64b3a3fd-17fc-49e8-98f2-5b715321b319", + "metadata": {}, + "source": [ + "From this we can see for example the `lat` dimension is chunked in sets of 64 values." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3cc50908-d97f-4267-8bb2-dbb261aa645a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'{\"chunks\":[64],\"compressor\":null,\"dtype\":\" Date: Thu, 28 Mar 2024 11:59:28 +0000 Subject: [PATCH 20/20] Updated dask version for docs --- docs/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index d1c1e3b..8f387c3 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,6 +5,6 @@ xarray==2023.1.0 kerchunk==0.2.0 numpy==1.24.4 scipy==1.10.1 -dask==2024.3.1 +dask==2023.3.1 binpacking==1.5.2 -distributed==2024.3.1 \ No newline at end of file +distributed==2023.3.1 \ No newline at end of file