Skip to content

Commit

Permalink
and a bunch of pre-commit
Browse files Browse the repository at this point in the history
  • Loading branch information
MattWellie committed Jan 10, 2024
1 parent f11d597 commit dad420b
Show file tree
Hide file tree
Showing 39 changed files with 187 additions and 183 deletions.
14 changes: 10 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,30 @@ repos:
- id: check-added-large-files

- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.37.0
rev: v0.38.0
hooks:
- id: markdownlint
args: ["--config", ".markdownlint.json"]

- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
name: isort (python)

- repo: https://github.com/ambv/black
rev: 22.10.0
rev: 23.12.1
hooks:
- id: black

- repo: https://github.com/charliermarsh/ruff-pre-commit
# Ruff version.
rev: "v0.1.0"
rev: v0.1.11
hooks:
- id: ruff

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.6.1
rev: v1.8.0
hooks:
- id: mypy
args:
Expand Down
25 changes: 10 additions & 15 deletions comparison/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,37 @@
"""
# mypy: ignore-errors
import json
import logging
import os
import re
import sys
from argparse import ArgumentParser
from collections import defaultdict
from csv import DictReader
from enum import Enum
import logging
import re
import sys
from typing import Any

from argparse import ArgumentParser
from cloudpathlib import AnyPath
from cyvcf2 import VCFReader
import hail as hl
from peddy import Ped

import hail as hl

from cpg_utils.config import get_config
from cpg_utils.hail_batch import init_batch

from reanalysis.hail_filter_and_label import (
CONFLICTING,
LOFTEE_HC,
PATHOGENIC,
extract_annotations,
filter_matrix_by_ac,
filter_on_quality_flags,
filter_to_population_rare,
filter_to_well_normalised,
green_and_new_from_panelapp,
CONFLICTING,
LOFTEE_HC,
PATHOGENIC,
)

from reanalysis.utils import read_json_from_path, canonical_contigs_from_vcf
from reanalysis.utils import canonical_contigs_from_vcf, read_json_from_path

SAMPLE_NUM_RE = re.compile(r'sample_[0-9]+')
SAMPLE_ALT_TEMPLATE = 'num_alt_alleles_{}'
Expand Down Expand Up @@ -167,7 +167,6 @@ def common_format_aip(results_dict: dict[str, Any]) -> CommonDict:

# collect all per-sample results into a separate index
for sample, variants in results_dict.items():

for var in variants:
coords = var['var_data']['coords']
sample_dict[sample].append(
Expand Down Expand Up @@ -207,7 +206,6 @@ def common_format_seqr(seqr: str, affected: list[str]) -> CommonDict:
]

for entry in seqr_parser:

# get all valid tags
tags = [
Confidence(tag)
Expand Down Expand Up @@ -324,7 +322,6 @@ def find_missing(aip_results: CommonDict, seqr_results: CommonDict) -> CommonDic
)

for sample in common_samples:

# only finds discrepancies, not Matched results - revise
sample_discrepancies = [
variant
Expand Down Expand Up @@ -388,12 +385,10 @@ def check_in_vcf(vcf_path: str, variants: CommonDict) -> tuple[CommonDict, Commo
# iterate over all samples, and corresponding lists
for sample, var_list in variants.items():
for var in var_list:

# assume missing until confirmed otherwise
found = False
normalised_chrom, coordinates = var.get_cyvcf2_pos(vcf_contigs)
for vcf_var in vcf_handler(coordinates):

# check position and alleles
if (
vcf_var.CHROM == normalised_chrom
Expand Down
8 changes: 3 additions & 5 deletions comparison/comparison_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,26 +9,24 @@
import logging
import os
import sys

from argparse import ArgumentParser

import hailtop.batch as hb

from cpg_utils.config import get_config
from cpg_utils.git import (
prepare_git_job,
get_git_commit_ref_of_current_repository,
get_organisation_name_from_current_directory,
get_repo_name_from_current_directory,
prepare_git_job,
)
from cpg_utils.hail_batch import (
authenticate_cloud_credentials_in_job,
copy_common_env,
image_path,
remote_tmpdir,
output_path,
remote_tmpdir,
)
from cpg_utils.config import get_config


# local script references
COMPARISON_SCRIPT = os.path.join(os.path.dirname(__file__), 'comparison.py')
Expand Down
2 changes: 0 additions & 2 deletions helpers/forbidden_gene_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ def find_version(panel_id: int, all_dates: list[str]) -> dict[str, str | None]:

# iterate through all activities on this panel
for activity in activities:

# cast the activity datetime to day-resolution
activity_date = datetime.strptime(
activity['created'].split('T')[0], '%Y-%m-%d'
Expand Down Expand Up @@ -200,7 +199,6 @@ def main(panels: str | None, out_path: str, dates: list[str]):

# check over all dates
for date in dates:

logging.info(f'Running the date {date}')

date_genes = set()
Expand Down
13 changes: 7 additions & 6 deletions helpers/prepare_aip_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,21 @@
- generates the cohort-specific TOML file
- tweaks for making singleton versions of the given cohort
"""
# mypy: ignore-errors
from argparse import ArgumentParser
from itertools import product
import json
import logging
import os
import toml

from cpg_utils import to_path, Path
# mypy: ignore-errors
from argparse import ArgumentParser
from itertools import product

import toml

from cpg_utils import Path, to_path
from metamist.graphql import gql, query

from reanalysis.utils import read_json_from_path, get_cohort_config
from reanalysis.hpo_panel_match import main as hpo_match
from reanalysis.utils import get_cohort_config, read_json_from_path

BUCKET_TEMPLATE = 'gs://cpg-{dataset}-test-analysis/reanalysis'
LOCAL_TEMPLATE = 'inputs/{dataset}'
Expand Down
24 changes: 24 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,29 @@
line-length = 88
skip-string-normalization = true

[tool.isort]
py_version = 311
profile = "black"
line_length = 88
sections = ["FUTURE", "STDLIB", "THIRDPARTY", "HAIL", "CPG", "FIRSTPARTY", "LOCALFOLDER"]
known_hail = [
"hail",
"hailtop",
]
# Adjust these for each repository, e.g., removing those that should be
# local rather than CPG. Also fill in extend_skip below if there are any
# subdirectories that should be ignored.
known_cpg = [
"analysis_runner",
"cpg_infra",
"cpg_utils",
"cpg_workflows",
"gnomad",
"hail_scripts",
"metamist",
]
# extend_skip = ["list", "submodules", "etc", "here"]

[tool.ruff]
line-length = 88
extend-select = ["T201"]
2 changes: 2 additions & 0 deletions reanalysis/clinvar_by_codon.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
"""

import click

import hail as hl

from cpg_utils.hail_batch import init_batch


Expand Down
8 changes: 5 additions & 3 deletions reanalysis/clinvar_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,18 @@
from os.path import join

import click
from cpg_utils import to_path, Path

from hailtop.batch.job import Job

from cpg_utils import Path, to_path
from cpg_utils.config import get_config
from cpg_utils.hail_batch import (
authenticate_cloud_credentials_in_job,
get_batch,
query_command,
)
from hailtop.batch.job import Job

from reanalysis import clinvar_by_codon, summarise_clinvar_entries, seqr_loader
from reanalysis import clinvar_by_codon, seqr_loader, summarise_clinvar_entries
from reanalysis.vep_jobs import add_vep_jobs


Expand Down
3 changes: 2 additions & 1 deletion reanalysis/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@
from enum import Enum
from os.path import join

import hail as hl
from cloudpathlib import AnyPath

import hail as hl


class CustomEncoder(json.JSONEncoder):
"""
Expand Down
9 changes: 5 additions & 4 deletions reanalysis/hail_filter_and_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,21 @@
from argparse import ArgumentParser
from datetime import datetime

from peddy import Ped

import hail as hl

from cpg_utils import to_path
from cpg_utils.config import get_config
from cpg_utils.hail_batch import init_batch, output_path
from peddy import Ped

from reanalysis.hail_audit import (
fields_audit,
vep_audit,
BASE_FIELDS_REQUIRED,
FIELDS_REQUIRED,
USELESS_FIELDS,
VEP_TX_FIELDS_REQUIRED,
fields_audit,
vep_audit,
)
from reanalysis.utils import read_json_from_path

Expand Down Expand Up @@ -62,7 +64,6 @@ def get_clinvar_table(key: str = 'clinvar_decisions') -> str | None:

clinvar_table = get_config()['workflow'].get(key)
if clinvar_table is not None:

if to_path(clinvar_table).exists():
logging.info(f'Using clinvar table {clinvar_table}')
return clinvar_table
Expand Down
8 changes: 4 additions & 4 deletions reanalysis/hail_filter_sv.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,16 @@
from argparse import ArgumentParser

import hail as hl

from cpg_utils import to_path
from cpg_utils.config import get_config
from cpg_utils.hail_batch import init_batch, genome_build
from cpg_utils.hail_batch import genome_build, init_batch

from reanalysis.hail_filter_and_label import (
MISSING_INT,
ONE_INT,
green_and_new_from_panelapp,
subselect_mt_to_pedigree,
ONE_INT,
MISSING_INT,
)
from reanalysis.static_values import get_logger
from reanalysis.utils import read_json_from_path
Expand Down Expand Up @@ -190,7 +191,6 @@ def main(


if __name__ == '__main__':

# general CLI identical to the small variant version
parser = ArgumentParser()
parser.add_argument('--mt', required=True, help='path to input MT')
Expand Down
6 changes: 3 additions & 3 deletions reanalysis/hpo_panel_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@

import networkx
import requests
from obonet import read_obo

from cpg_utils import to_path
from metamist.graphql import gql, query
from obonet import read_obo

from reanalysis.models import PhenotypeMatchedPanels, ParticipantHPOPanels
from reanalysis.models import ParticipantHPOPanels, PhenotypeMatchedPanels

HPO_KEY = 'HPO Terms (present)'
HPO_RE = re.compile(r'HP:[0-9]+')
Expand Down Expand Up @@ -81,7 +82,6 @@ def get_panels(endpoint: str = PANELS_ENDPOINT) -> dict[str, set[int]]:
while True:
endpoint_data = get_json_response(endpoint)
for panel in endpoint_data['results']:

# can be split over multiple strings
relevant_disorders = ' '.join(panel['relevant_disorders'] or [])
for match in re.findall(HPO_RE, relevant_disorders):
Expand Down
7 changes: 3 additions & 4 deletions reanalysis/html_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@

import jinja2
import pandas as pd
from cpg_utils import to_path
from peddy.peddy import Ped

from cpg_utils import to_path

from reanalysis.models import (
PanelApp,
PanelDetail,
Expand All @@ -22,9 +23,9 @@
StructuralVariant,
)
from reanalysis.utils import (
get_config,
get_cohort_config,
get_cohort_seq_type_conf,
get_config,
get_logger,
read_json_from_path,
)
Expand Down Expand Up @@ -174,7 +175,6 @@ def get_summary_stats(
ext_label_map: dict = self.ext_labels.copy() if self.ext_labels else {}

for sample in self.samples:

if len(sample.variants) == 0:
samples_with_no_variants.append(sample.ext_id)

Expand All @@ -184,7 +184,6 @@ def get_summary_stats(

# iterate over the list of variants
for variant in sample.variants:

var_string = variant.var_data.coordinates.string_format
unique_variants['any'].add(var_string)
sample_variants['any'].add(var_string)
Expand Down
Loading

0 comments on commit dad420b

Please sign in to comment.