From 2496124f6db0c31e41c4968b4357ce2923fd6038 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 6 Mar 2025 13:32:19 +1300 Subject: [PATCH 1/3] Improve docstrings --- rules/config.smk | 82 ++++++++++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 31 deletions(-) diff --git a/rules/config.smk b/rules/config.smk index 7d266a5..5b2c8b0 100644 --- a/rules/config.smk +++ b/rules/config.smk @@ -69,35 +69,50 @@ def is_scalar(x): def resolve_config_value(*rule_parts, sep="/"): """ - Resolve a config value defined by the *rule_parts* of the config, e.g. - rule_parts = ['filter', 'min_length'] then we expect a scalar or a - dictionary to be present at config['filter']['min_length']. If the config - value is to be used as a path please use the `resolve_config_path` function - instead. - - If a scalar then that value is returned, i.e. it's always the same no matter - what the wildcards are. Note that this scalar may have wildcards in it which will - be filled in by Snakemake when the relevant rule is evaluated. - - If a dictionary we search it for the relevant value by finding the closest matching - key once wildcards have been considered. For instance in a three-tiered wildcard pipeline - such as this with subtype, segment and time we interpret these as ordered in specificity. - For instance, if only one wildcard value is specified in the config (the others are '*') - then matching subtype is more specific than segment. Given example - wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of: - - 'h5nx/pb2/2y' ─ all 3 wildcard values specified - - 'h5nx/pb2/*' ┐ - - 'h5nx/*/2y' ├ 2/3 wildcard values specified - - '*/pb2/2y' ┘ - - 'h5nx/*/*' ┐ - - '*/pb2/*' ├ 1/3 wildcard values specified - - '*/*/2y' ┘ - - '*/*/*' ─ default / fall-back - and the first key present in the config is used. + A helper function intended to be used as directly as a Snakemake Input + function to resolve the appropriate config value. - Examples: + Given an array of config *rule_parts* (keys), we return a function with a + single argument *wildcards* which resolves to the appropriate config value. + For instance the config value(s) for `config['filter']['min_length']` would + be accessed via + + # within a Snakemake rule: params: - correction = resolve_config_value(['traits', 'sampling_bias_correction']) + min_length = resolve_config_value('filter', 'min_length') + + # within a python function: + min_length = resolve_config_value('filter', 'min_length')(wildcards) + + The underlying config value may be structured in two ways: + + 1. A scalar, e.g. `config['filter']['min_length'] = some_scalar`. In + this case the scalar value is simply returned, i.e. it's always the same no + matter what the wildcards are. + + 2. A dictionary, e.g. `config['filter']['min_length'] = {...}`), which + allows the resolved value to vary according to the wildcards. We search + the dictionary for the relevant value by finding the closest matching key + once wildcards have been considered. For instance in a three-tiered + wildcard pipeline such as this with subtype, segment and time we + interpret these as ordered in specificity. For instance, if only one + wildcard value is specified in the config (the others are '*') then + matching subtype is more specific than segment. Given example wildcard + values of {subtype=h5nx, segment=pb2, time=2y} then we have a search + order of: + - 'h5nx/pb2/2y' ─ all 3 wildcard values specified + - 'h5nx/pb2/*' ┐ + - 'h5nx/*/2y' ├ 2/3 wildcard values specified + - '*/pb2/2y' ┘ + - 'h5nx/*/*' ┐ + - '*/pb2/*' ├ 1/3 wildcard values specified + - '*/*/2y' ┘ + - '*/*/*' ─ default / fall-back + and the first key present in the config is used. + + Note that in both cases, the resolved value may be a string with wildcard + placeholders in it which may (separately) be filled in by Snakemake or a + call to `format` or `expand`. """ try: config_lookup = config @@ -130,12 +145,13 @@ def resolve_config_value(*rule_parts, sep="/"): msg += f'\n\tThe dictionary is missing a matching key for the current target of {search_keys[0]!r}, or a fuzzy match (i.e. using "*" placeholders)' msg += '\n\tP.S. If you want to use a single value across all builds then set a scalar value (number, string, boolean)' raise InvalidConfigError(msg) + return resolve def resolve_config_path(*fields): """ - A helper function intended to be used as directly as a value within - snakemake input/params blocks. + A helper function intended to be used as directly as a Snakemake Input + function to resolve the appropriate config path. Given an array of config *fields* (keys), we return a function with a single argument *wildcards* which resolves to a appropriate local file path @@ -144,9 +160,13 @@ def resolve_config_path(*fields): of those functions for more details. Examples: + # within a Snakemake rule input: - colors = resolve_config_path(['colors', 'hardcoded']), - lat_longs = resolve_config_path(["lat_longs"]), + colors = resolve_config_path('colors', 'hardcoded'), + lat_longs = resolve_config_path("lat_longs"), + + # within a python function + colors = resolve_config_path('colors', 'hardcoded')(wildcards) """ assert all([isinstance(f,str) for f in fields]), \ "Arguments to `resolve_config_path` must be strings" From 77796cc7a4eaa078367b1737353ac85757b471bd Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 6 Mar 2025 13:32:52 +1300 Subject: [PATCH 2/3] remove unused (old) functions --- rules/config.smk | 64 ------------------------------------------------ 1 file changed, 64 deletions(-) diff --git a/rules/config.smk b/rules/config.smk index 5b2c8b0..38cc55e 100644 --- a/rules/config.smk +++ b/rules/config.smk @@ -247,67 +247,3 @@ def expand_target_patterns(): targets.append(target) return targets - - -# REMOVE BELOW LINE # ------------------------------------------------------------------------------------------------ - - -def old_resolve_config_value_old(*rule_parts, fallback="FALLBACK"): - """ - Note that the underlying algorithm for finding the config value, - and the config syntax itself, is going to be significantly - modified in - - Given an array of config *fields* (keys), we return a function with a single - argument *wildcards* which resolves to the config value (any type). If the - config value is to be used as a path please use the `resolve_config_path` - function instead. - - Examples: - params: - correction = resolve_config_value(['traits', 'sampling_bias_correction']) - """ - def resolve(wildcards): - rule_name = rule_parts[0] - assert rule_name in config, f"Config missing top-level {rule_name} key" - if len(rule_parts)==1: - # Ths form of config syntax cannot use the dictionary-based wildcard-dependent syntax - # As per the docstring, this will be redone in - assert isinstance(config[rule_name], str), f"Invalid config spec for config.{rule_name}" - return config[rule_name] - rule_key = rule_parts[1] - assert rule_key in config[rule_name], f"Config missing entry for {rule_name}.{rule_key}" - try: - return config[rule_name][rule_key][wildcards.subtype][wildcards.time] - except KeyError: - assert fallback in config[rule_name][rule_key], f"config.{rule_name!r}.{rule_key!r} either needs " \ - f"an entry for {wildcards.subtype!r}.{wildcards.time!r} added or a (default) {fallback!r} key." - return config[rule_name][rule_key][fallback] - return resolve - -def old_resolve_config_path_old(*fields): - """ - A helper function intended to be used as directly as a value within - snakemake input/params blocks. - - Given an array of config *fields* (keys), we return a function with a single - argument *wildcards* which resolves to a appropriate local file path - (string). These are accomplished via calls to helper functions - `resolve_config_value` and `resolve_path`, respectively; see the docstrings - of those functions for more details. - - Examples: - input: - colors = old_resolve_config_path_old(['colors', 'hardcoded']), - lat_longs = old_resolve_config_path_old(["lat_longs"]), - """ - assert all([isinstance(f,str) for f in fields]), \ - "Arguments to `resolve_config_path` must be strings" - - def resolve(wildcards): - raw_value = old_resolve_config_value_old(*fields)(wildcards) - if not raw_value: # falsey -> don't resolve to a path! - return "" - return resolve_path(raw_value, wildcards) - - return resolve \ No newline at end of file From 720564780cdad4709cd904a44bb922ec98b7563a Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 6 Mar 2025 13:58:43 +1300 Subject: [PATCH 3/3] Improve how 'subtype_query' is defined Brings the config structure in line with all of the other config values, but this necessitates a more complicated usage of `resolve_config_value`. Hopefully the added comments are instructive to people in the future! This also greatly improves the error handling if we forget to add the appropriate subtype to the config, or leave out 'subtype_query' altogether. Inspired by and --- genome-focused/config.yaml | 11 ++++++----- rules/main.smk | 7 +++++-- segment-focused/config.yaml | 16 +++++++++------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/genome-focused/config.yaml b/genome-focused/config.yaml index 69fc46d..e334cee 100644 --- a/genome-focused/config.yaml +++ b/genome-focused/config.yaml @@ -45,12 +45,13 @@ dropped_strains: config/{subtype}/dropped_strains_{subtype}.txt clades_file: clade-labeling/h5n1-clades.tsv # use H5N1 clades description: config/{subtype}/description.md -## Subtype query - this structure is different from all other config parameters, requiring -# a key for each of the subtypes defined above in 'builds'. The string will be supplied to -# augur filter's --query argument. +## Subtype query - note that you cannot vary this based on segment/time wildcards +# as the filtering to subtype is independent (upstream) of those wildcards. +# Note also that you aren't limited to the "subtype" metadata field - any valid +# `augur filter --query` argument is ok. subtype_query: - "h5n1-cattle-outbreak": "genoflu in 'B3.13'" - "h5n1-d1.1": "genoflu in 'D1.1'" + "h5n1-cattle-outbreak/*/*": "genoflu in 'B3.13'" + "h5n1-d1.1/*/*": "genoflu in 'D1.1'" #### Rule-specific parameters #### # The formatting here represents the three-tiered nature of the avian-flu build which diff --git a/rules/main.smk b/rules/main.smk index cb0173b..5347e7e 100755 --- a/rules/main.smk +++ b/rules/main.smk @@ -183,7 +183,9 @@ rule filter_sequences_by_subtype: output: sequences = "results/{subtype}/{segment}/sequences.fasta", params: - subtypes=lambda w: config['subtype_query'][w.subtype], + # We don't have all wildcards set here (too early!) so we need to manually specify them for `resolve_config_value` + # (Note that we do have w.segment set, but we deliberately don't use it as the query must not vary by segment) + subtypes = lambda w: resolve_config_value('subtype_query')({'subtype': w.subtype, 'segment': '*', 'time': '*'}) shell: """ augur filter \ @@ -199,7 +201,8 @@ rule filter_metadata_by_subtype: output: metadata = "results/{subtype}/metadata.tsv", params: - subtypes= lambda w: config['subtype_query'][w.subtype], + # We don't have all wildcards set here (too early!) so we need to manually specify them for `resolve_config_value` + subtypes = lambda w: resolve_config_value('subtype_query')({'subtype': w.subtype, 'segment': '*', 'time': '*'}) shell: """ augur filter \ diff --git a/segment-focused/config.yaml b/segment-focused/config.yaml index eed6b83..4cc0390 100644 --- a/segment-focused/config.yaml +++ b/segment-focused/config.yaml @@ -51,14 +51,16 @@ dropped_strains: config/{subtype}/dropped_strains_{subtype}.txt clades_file: clade-labeling/{subtype}-clades.tsv description: config/description_gisaid.md -## Subtype query - this structure is different from all other config parameters, requiring -# a key for each of the subtypes defined above in 'builds'. The string will be supplied to -# augur filter's --query argument. +## Subtype query - note that you cannot vary this based on segment/time wildcards +# as the filtering to subtype is independent (upstream) of those wildcards. +# Note also that you aren't limited to the "subtype" metadata field - any valid +# `augur filter --query` argument is ok. subtype_query: - "h5nx": "subtype in ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9']" - "h5n1": "subtype in ['h5n1']" - "h7n9": "subtype in ['h7n9']" - "h9n2": "subtype in ['h9n2']" + "h5nx/*/*": "subtype in ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9']" + "h5n1/*/*": "subtype in ['h5n1']" + "h7n9/*/*": "subtype in ['h7n9']" + "h9n2/*/*": "subtype in ['h9n2']" + #### Rule-specific parameters #### # The formatting here represents the three-tiered nature of the avian-flu build which