From 9c50da666440eec3b8269e40d21749431affee4c Mon Sep 17 00:00:00 2001 From: Matt Myers Date: Mon, 25 Nov 2024 12:34:14 -0500 Subject: [PATCH 1/2] update references to HATCHet to HATCHet2 --- .github/workflows/main.yml | 4 +- CMakeLists.txt | 2 +- README.md | 6 +- cloud/README.md | 34 ++++---- custom/GATK4-CNV/custom-gatk4-cnv.sh | 4 +- custom/GATK4-CNV/demo-gatk4-cnv.sh | 20 ++--- custom/GATK4-CNV/gatk4cnsToBB.py | 4 +- docs/buildDocs.sh | 4 +- docs/source/README.md | 73 ++++++++---------- docs/source/conf.py | 6 +- docs/source/doc_check.md | 4 +- docs/source/doc_combine_counts.md | 4 +- docs/source/doc_compilation.md | 4 +- docs/source/doc_compute_cn.md | 10 +-- docs/source/doc_count_alleles.md | 4 +- docs/source/doc_count_reads.md | 4 +- docs/source/doc_count_reads_fw.md | 4 +- docs/source/doc_download_panel.md | 2 +- docs/source/doc_fullpipeline.md | 38 ++++----- docs/source/doc_genotype_snps.md | 2 +- docs/source/doc_manual_install.md | 20 ++--- docs/source/doc_phase_snps.md | 6 +- docs/source/doc_plot_bins.md | 6 +- docs/source/doc_plot_cn_1d2d.md | 2 +- docs/source/doc_runhatchet.md | 8 +- docs/source/index.rst | 4 +- docs/source/recommendation_clustering.md | 2 +- docs/source/recommendation_datatype.md | 2 +- docs/source/recommendation_inference.md | 28 +++---- docs/source/recommendation_runtime.md | 2 +- examples/demo-WES/demo-wes.sh | 20 ++--- examples/demo-complete/demo-complete.sh | 12 +-- script/README.md | 24 +++--- script/hatchet.ini | 4 +- src/hatchet.egg-info/PKG-INFO | 46 +++++++++++ src/hatchet.egg-info/SOURCES.txt | 94 +++++++++++++++++++++++ src/hatchet.egg-info/dependency_links.txt | 0 src/hatchet.egg-info/entry_points.txt | 2 + src/hatchet.egg-info/not-zip-safe | 0 src/hatchet.egg-info/requires.txt | 26 +++++++ src/hatchet.egg-info/top_level.txt | 1 + src/hatchet/bin/HATCHet.py | 8 +- src/hatchet/utils/check.py | 2 +- src/hatchet/utils/combine_counts.py | 10 +-- src/hatchet/utils/commands.py | 4 +- src/hatchet/utils/count_reads.py | 4 +- src/hatchet/utils/download_panel.py | 3 +- src/hatchet/utils/plot_cn_1d2d.py | 2 +- 48 files changed, 370 insertions(+), 205 deletions(-) create mode 100644 src/hatchet.egg-info/PKG-INFO create mode 100644 src/hatchet.egg-info/SOURCES.txt create mode 100644 src/hatchet.egg-info/dependency_links.txt create mode 100644 src/hatchet.egg-info/entry_points.txt create mode 100644 src/hatchet.egg-info/not-zip-safe create mode 100644 src/hatchet.egg-info/requires.txt create mode 100644 src/hatchet.egg-info/top_level.txt diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2529d192..dec129ca 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -50,7 +50,7 @@ jobs: echo "GRB_LICENSE_FILE=${GUROBI_HOME}/gurobi.lic" >> $GITHUB_ENV continue-on-error: true - - name: Install HATCHet with dev dependencies + - name: Install HATCHet2 with dev dependencies run: | python -m pip install .[dev] env: @@ -136,7 +136,7 @@ jobs: tar zxvf 1000GP_Phase3.tgz --wildcards *chr22* *sample echo "HATCHET_DOWNLOAD_PANEL_REFPANELDIR=$(pwd)" >> $GITHUB_ENV - - name: HATCHet Check + - name: HATCHet2 Check run: | hatchet check diff --git a/CMakeLists.txt b/CMakeLists.txt index df09b9be..a75cff69 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required( VERSION 2.8 ) -project( HATCHet ) +project( HATCHet2 ) set( CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR} ${CMAKE_MODULE_PATH} ) diff --git a/README.md b/README.md index 418b29b9..3be02410 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ 
![CI](https://github.com/raphael-group/hatchet/workflows/CI/badge.svg) [![codecov](https://codecov.io/gh/raphael-group/hatchet/branch/master/graph/badge.svg)](https://codecov.io/gh/raphael-group/hatchet)

-# HATCHet
+# HATCHet2

-HATCHet is an algorithm to infer allele and clone-specific copy-number aberrations (CNAs), clone proportions, and whole-genome duplications (WGD) for several tumor clones jointly from multiple bulk-tumor samples of the same patient or from a single bulk-tumor sample.
+HATCHet2 is an algorithm to infer allele- and clone-specific copy-number aberrations (CNAs), clone proportions, and whole-genome duplications (WGD) for several tumor clones jointly from multiple bulk-tumor samples of the same patient or from a single bulk-tumor sample.

-Complete documentation for HATCHet is available at [https://raphael-group.github.io/hatchet/](https://raphael-group.github.io/hatchet/)
+Complete documentation for HATCHet2 is available at [https://raphael-group.github.io/hatchet/](https://raphael-group.github.io/hatchet/)

diff --git a/cloud/README.md b/cloud/README.md
index 1e21ff1c..9652b20e 100644
--- a/cloud/README.md
+++ b/cloud/README.md
@@ -1,14 +1,14 @@
-# Running HATCHet in the cloud
+# Running HATCHet2 in the cloud

-HATCHet is a Docerizable application and comes with a Dockerfile for easy deployment. We have also made HATCHet
+HATCHet2 is a Dockerizable application and comes with a Dockerfile for easy deployment. We have also made HATCHet2
 available as a publicly accessible Docker image at the [Google Cloud Container Registry](https://cloud.google.com/container-registry).
-This facilitates running HATCHet in the cloud without worrying about downloading large BAM files, and without having to
-build and install HATCHet locally.
+This facilitates running HATCHet2 in the cloud without worrying about downloading large BAM files, and without having to
+build and install HATCHet2 locally.

-This README provides details on how to run HATCHet entirely on the [Google Cloud Platform](https://cloud.google.com) (GCP)
+This README provides details on how to run HATCHet2 entirely on the [Google Cloud Platform](https://cloud.google.com) (GCP)
 on large datasets made available at [ISB-CGC](https://isb-cgc.appspot.com/).

-## Running HATCHet on ISB-CGC Datasets
+## Running HATCHet2 on ISB-CGC Datasets

 ### Setting up access at ISB-CGC

 section and follow the steps to register your Google project with ISB-CGC.

 Note that your PI will most likely have to grant you access to one or more of these controlled datasets using
 [dbGap](https://dbgap.ncbi.nlm.nih.gov/). The steps in the walk-throughs and tutorials on the ISB-CGC website will
-verify that you do have the appropriate access you will need to programmatically read these datasets in HATCHet.
+verify that you have the appropriate access needed to programmatically read these datasets in HATCHet2.

 Also note that access to controlled datasets is typically granted only for 24 hours, so you will have to extend your
 access period on the ISB-CGC website if it has expired.

-### Setting up your environment to run HATCHet on GCP
+### Setting up your environment to run HATCHet2 on GCP

-You do not need to build or install HATCHet locally, either as a python package or a Docker image. The only pre-requisite
+You do not need to build or install HATCHet2 locally, either as a Python package or a Docker image. The only prerequisite
 is that you have installed the [Google Cloud SDK](https://cloud.google.com/sdk/docs/quickstart).
 This is most cleanly done by installing all required dependencies inside a new Python 3 Conda environment.

pip install oauth2client dsub
```

### Logging in to your GCP Account

After installing the required dependencies, make sure that you login to your Google account and set up your default
-project. These are **one time steps** to make sure that HATCHet is able to correctly talk to your project.
+project. These are **one-time steps** to make sure that HATCHet2 is able to correctly talk to your project.

```
gcloud auth application-default login

that you linked with your ISB-CGC account.

### Preparing a bucket for output files

In the Google project that you used in the steps above, use the following command to create a new bucket where the results
-of your HATCHet analysis will be saved:
+of your HATCHet2 analysis will be saved:

```
gsutil mb gs://BUCKET_NAME
```

Replace `BUCKET_NAME` with a globally-unique bucket name. This step can also be performed by logging in to the
[Google Cloud Console](https://console.cloud.google.com) and navigating to Home -> Storage -> Browser -> Create Bucket.

-### Fine-tuning the HATCHet script
+### Fine-tuning the HATCHet2 script

-The `_run.sh` script provided with HATCHet is an end-end worflow of HATCHet. This will be familiar to you if you have
-run HATCHet locally. You can comment out sections of this script to only run certain parts of HATCHet depending on your
+The `_run.sh` script provided with HATCHet2 is an end-to-end workflow of HATCHet2. This will be familiar to you if you have
+run HATCHet2 locally. You can comment out sections of this script to run only certain parts of HATCHet2 depending on your
 needs, and specify the values of certain flags of the pipeline.

The part of the script that you will want to pay attention to is the `Reference Genome` section. Depending on the
or `.fa` file available through `wget`.

### Running the scripts

-The `cloud_run.sh` script provided with HATCHet is a single [dsub](https://github.com/DataBiosphere/dsub) command that
-will run HATCHet in the cloud. This command leverages the [Google Life Sciences API](https://cloud.google.com/life-sciences/docs/reference/rest)
+The `cloud_run.sh` script provided with HATCHet2 is a single [dsub](https://github.com/DataBiosphere/dsub) command that
+will run HATCHet2 in the cloud. This command leverages the [Google Life Sciences API](https://cloud.google.com/life-sciences/docs/reference/rest)

and internally performs the following series of steps:

dsub \
```

In the above command, you will want to replace `PROJECT_ID` with your project id, `BUCKET_NAME` with the bucket name that
-you created above, `RUN_NAME` with any unique name (no spaces!) that identifies your HATCHet run. In addition:
+you created above, and `RUN_NAME` with any unique name (no spaces!) that identifies your HATCHet2 run. In addition:

- The `NORMALBAM` parameter should be replaced with the `gs://..` path to the matched-normal sample of the patient.

diff --git a/custom/GATK4-CNV/custom-gatk4-cnv.sh b/custom/GATK4-CNV/custom-gatk4-cnv.sh
index f8db60a7..a3c8231f 100644
--- a/custom/GATK4-CNV/custom-gatk4-cnv.sh
+++ b/custom/GATK4-CNV/custom-gatk4-cnv.sh
@@ -1,8 +1,8 @@
#!/usr/bin/bash

-# This is a custom complete pipeline of HATCHet which considers in input segmented files for one or more samples from the same patient, produced by the GATK4 CNV pipeline.
+# This is a custom complete pipeline of HATCHet2 which takes as input segmented files for one or more samples from the same patient, produced by the GATK4 CNV pipeline.

-HATCHET_HOME="/path/to/hatchet_home" # Provide the full path to HATCHet's repository
+HATCHET_HOME="/path/to/hatchet_home" # Provide the full path to HATCHet2's repository

CNVTOBB="${HATCHET_HOME}/custom/GATK4-CNV/gatk4cnsToBB.py"

diff --git a/custom/GATK4-CNV/demo-gatk4-cnv.sh b/custom/GATK4-CNV/demo-gatk4-cnv.sh
index 00747529..82e30b54 100644
--- a/custom/GATK4-CNV/demo-gatk4-cnv.sh
+++ b/custom/GATK4-CNV/demo-gatk4-cnv.sh
@@ -1,7 +1,7 @@
# Demo of the custom pipeline for GATK4 CNV data
: ex: set ft=markdown ;:<<'```shell' # This line makes this file both a guided and executable DEMO. The file can be displayed as a Markdown file, in which to read the instructions and descriptions of the demo and its results, and it is also a BASH script, which can be executed directly with BASH to run the demo after setting up the first requirements.

-The following HATCHet's demo represents a guided example of the custom pipeline designed to start from the data produced by the [GATK4 CNV pipeline](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11147). This custom pipeline considers one or more tumor samples from the same patient which have been segmented through the GATK4 CNV pipeline, such that for each sample a **segmented file** is available. The expected format of each segmented file is first described in the following section. Next, the requirements for this demo are described and the guided demo is detailed across the different steps.
+The following HATCHet2 demo represents a guided example of the custom pipeline designed to start from the data produced by the [GATK4 CNV pipeline](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11147). This custom pipeline considers one or more tumor samples from the same patient which have been segmented through the GATK4 CNV pipeline, such that a **segmented file** is available for each sample. The expected format of each segmented file is first described in the following section. Next, the requirements for this demo are described and the guided demo is detailed across the different steps.

## Input format

Two example segmented files in this format for two tumor samples from the same p

## Requirements and set up

-The demo requires that HATCHet has been succesfully compiled and all the dependencies are available and functional. As such, the demo requires the user to properly set up the following paths:
+The demo requires that HATCHet2 has been successfully compiled and all the dependencies are available and functional. As such, the demo requires the user to properly set up the following paths:

```shell
PY="python3" # This is the full path to the version of PYTHON3 which contains the required modules. When this corresponds to the standard version, the user can keep the given value of `python3`
:<<'```shell' # Ignore this line
```

-The following paths are consequently obtained to point to the required components of HATCHet
+The following paths are consequently obtained to point to the required components of HATCHet2

```shell
CLUSTERBINS="${PY} -m hatchet cluster-bins"

PS4='[\t]'

## Generating input BB file

+The first step of this custom pipeline aims to generate an input BB file for HATCHet2 starting from the given segmented files; in this case, we consider the two examples included with this demo, `sample1.GATK4.CNV.seg` and `sample2.GATK4.CNV.seg`. The corresponding BB file can be easily obtained by using the custom python script [gatk4cnsToBB.py](gatk4cnsToBB.py) included in the custom pipeline. We apply the script by specifying the two segmented files in a whitespace-separated list between quotes and specifying the names of the samples in the same order with `--samples`. In addition, we consider the default values of the parameters and we run it as follows:

```shell
${GATK4CNSTOBB} "sample1.GATK4.CNV.seg sample2.GATK4.CNV.seg" --samples "Sample1 Sample2" > samples.GATK4.CNV.bb
:<<'```shell' # Ignore this line
```

In addition, one could consider a different size of the resulting bins by using the corresponding option of the script.

## Global clustering

+Having the input BB file, we can continue by executing the standard HATCHet2 pipeline and skipping the pre-processing steps (`count-reads`, `count-alleles`, and `combine-counts`). As such, the next main step of the demo performs the global clustering of HATCHet2, in which genomic bins that have the same copy-number state in every tumor clone are clustered correspondingly. To do this, we use `cluster-bins`, i.e. the HATCHet2 component designed for this purpose. At first, we attempt to run the clustering using the default values of the parameters as follows:

```shell
${CLUSTERBINS} samples.GATK4.CNV.bb -o samples.GATK4.CNV.seg -O samples.GATK4.CNV.bbc -e 12 -tB 0.03 -tR 0.15 -d 0.08
:<<'```shell' # Ignore this line
```

-To assess the quality of the clustering we generate the cluster plot using the `CBB` command of `plot-bins`, i.e. the HATCHet's component designed for the analysis of the data. For simplicity, we also use the following option `-tS 0.001` which asks to plot only the clusters which cover at least the `0.1%` of the genome. This is useful to clean the figure and focus on the main components.
+To assess the quality of the clustering we generate the cluster plot using the `CBB` command of `plot-bins`, i.e. the HATCHet2 component designed for the analysis of the data. For simplicity, we also use the option `-tS 0.001`, which asks to plot only the clusters that cover at least `0.1%` of the genome. This is useful to clean the figure and focus on the main components.

```shell
${PLOTBINS} -c CBB samples.GATK4.CNV.bbc -tS 0.001

We can easily notice that the clustering is good and no tuning is needed as eve

## hatchet's step

-Next we apply `hatchet`, i.e. the component of HATCHet which estimates fractional copy numbers, infers allele-and-clone specific copy numbers, and jointly predicts the number of clones (including the normal clone) and the presence of a WGD.
-We apply the last step with default parameters and, for simplicity of this demo, we consider 6 clones, which can be easily considered by HATCHet in this case, and we only consider 100 restarts for the coordinate-descent method; these are the number of attempts to find the best solution. This number is sufficient in this small example but we reccommend to use at least 400 restarts in standard runs.
+Next we apply `hatchet`, i.e. the component of HATCHet2 which estimates fractional copy numbers, infers allele- and clone-specific copy numbers, and jointly predicts the number of clones (including the normal clone) and the presence of a WGD.
+We apply the last step with default parameters and, for simplicity of this demo, we consider 6 clones, which can easily be handled by HATCHet2 in this case, and we only consider 100 restarts for the coordinate-descent method; these are the number of attempts to find the best solution. This number is sufficient in this small example, but we recommend using at least 400 restarts in standard runs.

```shell
${INFER} -i samples.GATK4.CNV -n2,6 -p 100 -v 2 -u 0.03 -r 12 -eD 6 -eT 12 -l 0.6 |& tee hatchet.log
:<<'```shell' # Ignore this line
```

We obtain the following summary of results:

## The related-tetraploid resulting files are copied to ./chosen.tetraploid.bbc.ucn and ./chosen.tetraploid.seg.ucn
# The chosen solution is diploid with 3 clones and is written in ./best.bbc.ucn and ./best.seg.ucn

-HATCHet predicts the presence of 3 clones in the 2 tumor samples and, especially, predicts that a sample contains two distinct tumor clones, according to the true clonal composition, and one of these clones is shared with the other sample.
+HATCHet2 predicts the presence of 3 clones in the 2 tumor samples and, in particular, predicts that one sample contains two distinct tumor clones, in agreement with the true clonal composition, and that one of these clones is shared with the other sample.

## Analyzing inferred results

-Finally, we obtain useful plots to summarize and analyze the inferred results by using `plot-cn`, which is the last component of HATCHet. We run `plot-cn` as follows
+Finally, we obtain useful plots to summarize and analyze the inferred results by using `plot-cn`, which is the last component of HATCHet2. We run `plot-cn` as follows

```shell
${PLOTCN} best.bbc.ucn

diff --git a/custom/GATK4-CNV/gatk4cnsToBB.py b/custom/GATK4-CNV/gatk4cnsToBB.py
index 0f97b55d..21482e08 100644
--- a/custom/GATK4-CNV/gatk4cnsToBB.py
+++ b/custom/GATK4-CNV/gatk4cnsToBB.py
@@ -11,7 +11,7 @@ def parse_args():
     description = (
         "This method takes as input multiple samples from the same patient, where each sample is a "
-        "segmented CNV file produced by GATK4 CNV pipeline, and produces a BB input file for HATCHet."
+        "segmented CNV file produced by the GATK4 CNV pipeline, and produces a BB input file for HATCHet2."
     )
     parser = argparse.ArgumentParser(description=description)
     parser.add_argument(
@@ -19,7 +19,7 @@
         type=str,
         help=(
             "A white-space-separated list between quotes where each element is a segmented CNV file produced by "
-            "GATK4 CNV pipeline. The file format is describe in the HATCHet's repository."
+            "the GATK4 CNV pipeline. The file format is described in HATCHet2's repository."
         ),
     )
     parser.add_argument(

diff --git a/docs/buildDocs.sh b/docs/buildDocs.sh
index 0530c2f3..6f8da7a4 100755
--- a/docs/buildDocs.sh
+++ b/docs/buildDocs.sh
@@ -27,9 +27,9 @@ git checkout -b gh-pages

# Add README
cat > README.md <

-![](hatchet-cartoon.png "HATCHet algorithm")
+![](hatchet-cartoon.png "HATCHet2 algorithm")

-**Overview of HATCHet algorithm.**
-1. HATCHet analyzes the read-depth ratio (RDR) and the B-allele frequency (BAF) in bins of the reference genome (black squares) jointly from multiple tumor samples. Here, we show two tumor samples *p* and *q*.
-2. HATCHet globally clusters the bins based on RDR and BAF along the entire genome and jointly across samples *p* and *q*. Each cluster (color) includes bins with the same copy-number state within each clone present in *p* or *q*.
-3. HATCHet estimates the fractional copy number of each cluster. If there is no WGD, the identification of the cluster (magenta) with copy-number state _(1, 1)_ is sufficient and RDRs are scaled correspondingly. If a WGD occurs, HATCHet finds the cluster with copy-number state _(2, 2)_ (same magenta cluster) and a second cluster having an identical copy-number state in all tumor clones.
+**Overview of HATCHet2 algorithm.**
+1. HATCHet2 analyzes the read-depth ratio (RDR) and the B-allele frequency (BAF) in bins of the reference genome (black squares) jointly from multiple tumor samples. Here, we show two tumor samples *p* and *q*.
+2. HATCHet2 globally clusters the bins based on RDR and BAF along the entire genome and jointly across samples *p* and *q*. Each cluster (color) includes bins with the same copy-number state within each clone present in *p* or *q*.
+3. HATCHet2 estimates the fractional copy number of each cluster. If there is no WGD, the identification of the cluster (magenta) with copy-number state _(1, 1)_ is sufficient and RDRs are scaled correspondingly. If a WGD occurs, HATCHet2 finds the cluster with copy-number state _(2, 2)_ (same magenta cluster) and a second cluster having an identical copy-number state in all tumor clones.
+4.
HATCHet2 factorizes the allele-specific fractional copy numbers *F^A, F^B* into the allele-specific copy numbers *A, B*, respectively, and the clone proportions *U*. Here there is a normal clone and 3 tumor clones.
+5. HATCHet2's model selection criterion identifies the matrices *A*, *B* and *U* in the factorization while evaluating the fit according to both the inferred number of clones and presence/absence of a WGD.
6. Clusters are classified by their inferred copy-number states in each sample. *Sample-clonal clusters* have a unique copy-number state in the sample and correspond to evenly-spaced positions in the scaled RDR-BAF plot (vertical grid lines in each plot). *Sample-subclonal clusters* (e.g. cyan in *p*) have different copy-number states in a sample and thus correspond to intermediate positions in the scaled RDR-BAF plot. *Tumor-clonal clusters* have identical copy-number states in all tumor clones -- thus they are sample-clonal clusters in every sample and preserve their relative positions in scaled-RDR-BAF plots. In contrast, *tumor-subclonal clusters* have different copy-number states in different tumor clones and their relative positions in the scaled RDR-BAF plot vary across samples (e.g. purple cluster).

-Note that this overview and figure do not include recently added features such as variable-width binning and locality-aware clustering that are currently default in HATCHet. These features will be described in a future publication.
+Note that this overview and figure do not include recently added features such as variable-width binning and locality-aware clustering that are currently default in HATCHet2. These features will be described in a future publication.

### Software

-The current implementation of HATCHet is composed of two sets of modules:
+The current implementation of HATCHet2 is composed of two sets of modules:

-(1) The *core* modules of HATCHet are designed to efficiently solve a challenging constrained and distance-based simultaneous matrix factorization which aim to infer allele and clone-specific copy numbers and clone proportins from fractional copy numbers. The module is implemented in C++11 and are included in `src` folder.
+(1) The *core* modules of HATCHet2 are designed to efficiently solve a challenging constrained and distance-based simultaneous matrix factorization, which aims to infer allele- and clone-specific copy numbers and clone proportions from fractional copy numbers. These modules are implemented in C++11 and are included in the `src` folder.

-(2) The *utility* modules of HATCHet perform several different tasks that are needed to process the raw data, perform steps of the HATCHet's algorithm needed for the factorization, and process the results. These task include reading/calling germinal single-point mutations, counting reads from a BAM file, combining the read counts and other information, segmenting through HATCHet's global approach, plotting very useful information, etc.
+(2) The *utility* modules of HATCHet2 perform several different tasks that are needed to process the raw data, perform the steps of HATCHet2's algorithm needed for the factorization, and process the results. These tasks include reading/calling germline single-point mutations, counting reads from a BAM file, combining the read counts and other information, segmenting through HATCHet2's global approach, plotting very useful information, etc. These modules are implemented in python 3 and are available as the `util` and `bin` submodules.

## Setup

The setup process is composed of 3 steps:

#### Standard Installation

-HATCHet can be installed using an existing installation of `conda` (e.g. either the compact
+HATCHet2 can be installed using an existing installation of `conda` (e.g. either the compact
 [Miniconda](https://docs.conda.io/en/latest/miniconda.html) or the complete [Anaconda](https://www.anaconda.com/)).
 We recommend creating a new environment in which to install `hatchet`:

Then, `hatchet` can be installed with the following one-time command:

conda install hatchet
```

-If you would like to run the reference-based phasing modules of HATCHet, please install the additional dependency `shapeit` from the channel `dranew`:
+If you would like to run the reference-based phasing modules of HATCHet2, please install the additional dependency `shapeit` from the channel `dranew`:

```shell
conda install -c dranew shapeit
```

#### Manual Installation

-If you wish to install `HATCHet` directly from this repository, the steps are a bit more involved. Please refer to the
+If you wish to install `HATCHet2` directly from this repository, the steps are a bit more involved. Please refer to the
 [Manual Installation](doc_manual_install.md) document for more details.

### Using a Solver

-Every run of HATCHet (specifically, the `compute-cn` step) needs to use a [Pyomo](https://pyomo.readthedocs.io/en/stable/solving_pyomo_models.html#supported-solvers) supported solver. By default, the HATCHet is compiled against [Gurobi](https://www.gurobi.com/), so the easiest (and fastest) option is to use a valid Gurobi license.
+Every run of HATCHet2 (specifically, the `compute-cn` step) needs to use a [Pyomo](https://pyomo.readthedocs.io/en/stable/solving_pyomo_models.html#supported-solvers)-supported solver. By default, HATCHet2 is compiled against [Gurobi](https://www.gurobi.com/), so the easiest (and fastest) option is to use a valid Gurobi license.

#### Using Gurobi

section of your `hatchet.ini` (if using the [hatchet run](doc_runhatchet.html) c
solver. Make sure the relevant solver binaries are in your `$PATH`, otherwise Pyomo will not be able to find them correctly.

-One HATCHet command that is very useful to sanity-check your solver is `hatchet check`. This command runs the HATCHet `compute_cn` step on a small set of pre-packaged data files and completes fairly quickly (a few seconds for the Gurobi optimizer, but up to a few minutes for glpk). Running this command will ensure that you have your solver settings (including licenses) set up correctly, so you should always run this command first before trying out the `compute_cn` step on your large data files.
+One HATCHet2 command that is very useful to sanity-check your solver is `hatchet check`. This command runs the HATCHet2 `compute_cn` step on a small set of pre-packaged data files and completes fairly quickly (a few seconds for the Gurobi optimizer, but up to a few minutes for glpk). Running this command ensures that your solver settings (including licenses) are set up correctly, so you should always run it first before trying out the `compute_cn` step on your large data files.

### Required data

-HATCHet requires 3 input data files:
+HATCHet2 requires 3 input data files:

1. One or more BAM files containing DNA sequencing reads obtained from tumor samples of a single patient.
 Every BAM file contains the sequencing reads from a sample and needs to be indexed and sorted. For example, each BAM file can be easily indexed and sorted using [SAMtools](http://www.htslib.org/workflow/#mapping_to_variant). In addition, one can improve the quality of the data by processing the BAM files according to the [GATK Best Practices](https://software.broadinstitute.org/gatk/best-practices/).

2. A BAM file containing DNA sequencing reads obtained from a matched-normal sample of the same patient as the considered tumor samples. The BAM file needs to be indexed and sorted like the tumor BAM files. Also, this BAM file can be processed in the same way as the tumor BAM files.

-3. A human reference genome. Ideally, one should consider the same human reference genome used to align the sequencing reads in the given BAM files. The most-used human reference genome are available at [GRC](https://www.ncbi.nlm.nih.gov/grc/human) or [UCSC](http://hgdownload.cse.ucsc.edu/downloads.html#human). Observe that human reference genomes use two different notations for chromosomes: either `1, 2, 3, 4, 5 ...` or `chr1, chr2, chr3, chr4, chr5 ...`. One needs to make sure all BAM files and reference genome share that same chromosome notation. When this is not the case, one needs to change the reference to guarantee consistency and needs to re-index the new reference (e.g. using [SAMtools](http://www.htslib.org/workflow/#mapping_to_variant)). Also, HATCHet requires that the name of each chromosome is the first word in each ID such that `>1 [ANYTHING] ... \n>2 [ANYTHING] ... \n>3 [ANYTHING] ...` or `>chr1 [ANYTHING] ... \n>chr2 [ANYTHING] ... \n>chr3 [ANYTHING]`.
+3. A human reference genome. Ideally, one should consider the same human reference genome used to align the sequencing reads in the given BAM files. The most-used human reference genomes are available at [GRC](https://www.ncbi.nlm.nih.gov/grc/human) or [UCSC](http://hgdownload.cse.ucsc.edu/downloads.html#human). Observe that human reference genomes use two different notations for chromosomes: either `1, 2, 3, 4, 5 ...` or `chr1, chr2, chr3, chr4, chr5 ...`. One needs to make sure all BAM files and the reference genome share the same chromosome notation. When this is not the case, one needs to change the reference to guarantee consistency and re-index the new reference (e.g. using [SAMtools](http://www.htslib.org/workflow/#mapping_to_variant)). Also, HATCHet2 requires that the name of each chromosome is the first word in each ID, such that `>1 [ANYTHING] ... \n>2 [ANYTHING] ... \n>3 [ANYTHING] ...` or `>chr1 [ANYTHING] ... \n>chr2 [ANYTHING] ... \n>chr3 [ANYTHING]`.

 For the reference genome, HATCHet2 requires the existence of a sequence dictionary (`.dict`), which is part of all standard pipelines for sequencing data; see for example [GATK](https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) or [Galaxy](https://galaxyproject.org/admin/data-preparation/). Please note that the sequence dictionary is **NOT** the reference index `.fai`, which is a different structure with a different function (though the index is also recommended).

 The dictionary of a reference genome is often included in the available bundles for the reference genomes; see the [example for hg19](ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg19) from the Broad Institute. However, the dictionary can also be generated in seconds using either [SAMtools](http://www.htslib.org/doc/samtools-dict.html) or [Picard tools](https://gatk.broadinstitute.org/hc/en-us/articles/360036729911-CreateSequenceDictionary-Picard-).

## Usage

-The repository includes all the components that are required to cover every step of the entire HATCHet's pipeline, starting from the processing of raw data reported in a BAM file through the analysis of the final results.
+The repository includes all the components required to cover every step of the entire HATCHet2 pipeline, starting from the processing of raw data reported in a BAM file through the analysis of the final results.

We provide:

-- A script representing the [full pipeline](doc_fullpipeline.html#fullpipelineandtutorial) of HATCHet, and we describe in details the whole script through a tutorial with instructions for usage.
+- A script representing the [full pipeline](doc_fullpipeline.html#fullpipelineandtutorial) of HATCHet2, and we describe the whole script in detail through a tutorial with instructions for usage.
-- [Demos](doc_fullpipeline.html#demos) that correspond to guided executions of HATCHet on some examples, and explain in detail the usage of HATCHet when considering standard datasets, real datasets with high noise, and different kind of data.
+- [Demos](doc_fullpipeline.html#demos) that correspond to guided executions of HATCHet2 on some examples, and explain in detail the usage of HATCHet2 when considering standard datasets, real datasets with high noise, and different kinds of data.
-- [Custom pipelines](doc_fullpipeline.html#custompipelines) which adapt the full HATCHet's pipeline to special conditions or integrates pre-processed data belonging to different pipelines.
+- [Custom pipelines](doc_fullpipeline.html#custompipelines) which adapt the full HATCHet2 pipeline to special conditions or integrate pre-processed data belonging to different pipelines.

 The implementation of HATCHet2 is highly modular and one can replace any HATCHet2 module with any other method to obtain the required results (especially for the pre-processing modules).
 As such, we also provide here an overview of the entire pipeline and we describe the [details of each step](doc_fullpipeline.html#detailedsteps) in a dedicated section of the manual.

+- [Recommendations](doc_fullpipeline.html#recommendations), especially for noisy datasets or datasets with different features, to guide the user in the interpretation of HATCHet2's inference. We explain how to perform quality control to guarantee the best-quality results, and describe how the user can control and tune some of the parameters to obtain the best-fitting results.

## Current issues

-HATCHet is in active development, please report any issue or question as this could help the devolment and imporvement of HATCHet. Current known issues with current version are reported here below:
+HATCHet2 is in active development; please report any issue or question, as this can help the development and improvement of HATCHet2. Known issues with the current version are reported below:
- The allele-swapping feature of combine-counts has been temporarily disabled due to conflicts with recent SAMtools versions.
-- HATCHet has not been tested on Windows yet. For Windows users, we recommend [Windows Subsystems for Linux](https://docs.microsoft.com/en-us/windows/wsl/install).
+- HATCHet2 has not been tested on Windows yet. For Windows users, we recommend [Windows Subsystems for Linux](https://docs.microsoft.com/en-us/windows/wsl/install).

A list of the major recent updates:
- Variable-width binning which adapts to the sequencing coverage and observed heterozygous germline SNP positions

## Contacts

-HATCHet is maintained by the research groups of Prof. Ben Raphael (Princeton) [[email]](mailto:braphael@princeton.edu) and Prof. Simone Zaccaria (UCL) [[email]](mailto:s.zaccaria@ucl.ac.uk). Major contributors include Matt Myers, Vineet Bansal, and Brian Arnold.
+HATCHet2 is maintained by the research groups of Prof. Ben Raphael (Princeton) [[email]](mailto:braphael@princeton.edu) and Prof. Simone Zaccaria (UCL) [[email]](mailto:s.zaccaria@ucl.ac.uk). Major contributors include Matt Myers, Vineet Bansal, and Brian Arnold.

diff --git a/docs/source/conf.py b/docs/source/conf.py
index f9bf66ee..9ebfff0c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -63,9 +63,9 @@ master_doc = "index"

# General information about the project.
-project = "HATCHet"
-copyright = "2021, Princeton University"
-author = "Simone Zaccaria"
+project = "HATCHet2"
+copyright = "2024, Princeton University"
+author = "Matthew Myers, Simone Zaccaria, Vineet Bansal, and Brian Arnold"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the

diff --git a/docs/source/doc_check.md b/docs/source/doc_check.md
index 57e0bc2a..d3b158b6 100644
--- a/docs/source/doc_check.md
+++ b/docs/source/doc_check.md
@@ -1,10 +1,10 @@
# check

-This command of HATCHet verifies that dependencies that are unique to specific parts of HATCHet are correctly installed. The relevant commands with dependencies are `count-reads`, `phase-snps`, and `compute-cn`.
+This command of HATCHet2 verifies that the dependencies that are unique to specific parts of HATCHet2 are correctly installed. The relevant commands with dependencies are `count-reads`, `phase-snps`, and `compute-cn`.

All checks can be run simultaneously via `hatchet check`, or an individual command can be checked via, e.g., `hatchet check compute-cn`.

-The check for `compute-cn` runs the step on a set of small data files (.bbc/.seg) pre-packaged with HATCHet, and is a quick way to verify if your solver is working correctly.
+The check for `compute-cn` runs the step on a set of small data files (.bbc/.seg) pre-packaged with HATCHet2, and is a quick way to verify that your solver is working correctly.
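For example, both checks mentioned above can be run directly from a shell:

```shell
# Verify the dependencies of every step at once, including the
# compute-cn solver check on the pre-packaged .bbc/.seg files:
hatchet check

# Verify a single step only, e.g. the solver used by compute-cn:
hatchet check compute-cn
```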
If you are unable to run this command, it likely indicates a licensing issue with the default (Gurobi) solver. To use alternative solvers, see the [Using a different Pyomo-supported solver](README.html#usingasolver_other) section of the README for more details.

diff --git a/docs/source/doc_combine_counts.md b/docs/source/doc_combine_counts.md
index f0f4012e..7ff835d5 100644
--- a/docs/source/doc_combine_counts.md
+++ b/docs/source/doc_combine_counts.md
@@ -48,7 +48,7 @@ combine-counts produces a tab-separated file (`-o, --outfile`) with the followin
| `NORMAL_READS` | Total number of reads in the bin in the matched normal sample |
| `CORRECTED_READS` | Total number of reads in the bin in `SAMPLE`, corrected by the total reads in `SAMPLE` vs. the total reads in matched normal. |

-Currently, it produces one such file that excludes sex chromosomes (for use in HATCHet), and one that includes sex chromosomes (for future use).
+Currently, it produces one such file that excludes sex chromosomes (for use in HATCHet2), and one that includes sex chromosomes (for future use).

## Main parameters

A phased VCF file must be given via argument `-p, --phase` to apply reference-ba

| Name | Description | Usage | Default |
|------|-------------|-------|---------|
-| `-p`, `--phase` | vcf.gz with phasing for all het. SNPs | File containing phasing data for germline SNPs, typically `phased.vcf.gz` if using the HATCHet pipeline. | None (no phasing is performed) |
+| `-p`, `--phase` | vcf.gz with phasing for all het. SNPs | File containing phasing data for germline SNPs, typically `phased.vcf.gz` if using the HATCHet2 pipeline. | None (no phasing is performed) |
| `-s`, `--blocksize` | Maximum phasing block size | Maximum distance (in bp) between a pair of SNPs included in the same phasing block (ignored if `-p, --phase` is not used) | 25000 |
| `-m`, `--max_spb` | Maximum number of SNPs per phased block | No more than this many SNPs can be included in the same phasing block (included to minimize phasing errors in high-LD regions) | 10 |
| `-a`, `--alpha` | Significance threshold to allow adjacent SNPs to be merged | If adjacent SNPs have significantly different BAFs (at this significance level) after taking the phasing into account, they are not merged a priori. Higher means less trust in phasing. | 0.1 |

diff --git a/docs/source/doc_compilation.md b/docs/source/doc_compilation.md
index 65e8698c..c53f8d99 100644
--- a/docs/source/doc_compilation.md
+++ b/docs/source/doc_compilation.md
@@ -1,6 +1,6 @@
# Detailed compilation

-To perform the compilation, execute the following commands from the root of HATCHet's repository.
+To perform the compilation, execute the following commands from the root of HATCHet2's repository.

```shell
$ mkdir build
$ cd build/
$ ccmake ..
$ make
```

-HATCHet's compilation process attempts to automatically find the following Gurobi's paths.
+HATCHet2's compilation process attempts to automatically find the following Gurobi paths.

| Name | Path | Comment |
|------|------|---------|

diff --git a/docs/source/doc_compute_cn.md b/docs/source/doc_compute_cn.md
index 2fb333ee..69793f7a 100644
--- a/docs/source/doc_compute_cn.md
+++ b/docs/source/doc_compute_cn.md
@@ -110,7 +110,7 @@ hatchet solves a constrained and distance-based variant of the factorization tha
| `-f`, `--noampdel` | Activate clone evolutionary constraints | The user can decide whether or not to enable constraints on the evolution of tumor clones. These constraints force each allele to be either amplified or deleted across all tumor clones | Activated |
| `-d`, `--cnstates` | Maximum number of distinct copy-number states per cluster | When enabled, the maximum number of distinct copy-number states per cluster is fixed. This option is deprecated | Not used |

-HATCHet implements two methods to solve the constrained and distance-based simultaneous factorization: (1) a integer-linear programming (ILP) and (2) a coordinate-descent method (CD).
+HATCHet2 implements two methods to solve the constrained and distance-based simultaneous factorization: (1) an integer-linear program (ILP) and (2) a coordinate-descent method (CD).
These methods can be combined in 3 different modes:
- (0) CD + ILP: the solution found by CD is used to start the ILP. As such, ILP attempts to improve the solution found by CD.
- (1) ILP only

In addition, the solving methods can be controlled by the following parameters.

| `-s`, `--timelimit` | Time limit | The time limit, expressed in seconds, is imposed on every step of the CD algorithm or on the whole ILP | None |
| `-m`, `--memlimit` | Memory limit | The memory limit, expressed in megabytes, is imposed on every step of the CD algorithm or on the whole ILP. The execution is not interrupted when the threshold is reached; disk is used instead | None |
| `--maxiterations` | Maximum number of iterations per seed | This number is imposed as the maximum number of iterations executed for every restart of the CD algorithm | 40 |
-| `--diploid` | Assume no WGD | When enabled, HATCHet assumes the absence of a WGD | Not used |
-| `--tetraloid` | Assume a WGD | When enabled, HATCHet assumes the occurrence of a WGD | Not used |
+| `--diploid` | Assume no WGD | When enabled, HATCHet2 assumes the absence of a WGD | Not used |
+| `--tetraploid` | Assume a WGD | When enabled, HATCHet2 assumes the occurrence of a WGD | Not used |

## Model selection

This step has two main parameters to control the model-selection criterion:

| Name | Description | Usage | Default |
|------|-------------|-------|---------|
-| `-l`, `--limitinc` | Sensitivity level | The sensitivity level is used to control the confidence in evaluating the presence of tumor clones characterized by small CNAs. By decreasing the value of the sensitivity, HATCHet is more sensible to the presence of small CNAs and small clusters or with small shifts in RDR/BAF are more likely considered as the signal of an additional tumor clone. The possible values of this parameter are between 1.0 and 0.0 and specifically corresponds to an upper bound for the left relative improvement of the objective function. | None, reasonable values to use can be 0.6, 0.5, 0.4, 0.3, 0.2, ... according to the values of the objective function. |
+| `-l`, `--limitinc` | Sensitivity level | The sensitivity level is used to control the confidence in evaluating the presence of tumor clones characterized by small CNAs. By decreasing the value of the sensitivity, HATCHet2 becomes more sensitive to the presence of small CNAs, and small clusters or clusters with small shifts in RDR/BAF are more likely to be considered as the signal of an additional tumor clone. The possible values of this parameter are between 1.0 and 0.0 and specifically correspond to an upper bound for the left relative improvement of the objective function. | None; reasonable values to use can be 0.6, 0.5, 0.4, 0.3, 0.2, ... according to the values of the objective function. |
-| `-g`, `--ghostprop` | Confidence in the presence of a single tumor clone | This value expresses the confidence of HATCHet when evaluating the presence of a single tumor clone. The higher the value the more likely the presence of a single clone is considered | 0.2 |
+| `-g`, `--ghostprop` | Confidence in the presence of a single tumor clone | This value expresses the confidence of HATCHet2 when evaluating the presence of a single tumor clone. The higher the value, the more likely the presence of a single clone is considered | 0.2 |

## Additional parameters

diff --git a/docs/source/doc_count_alleles.md b/docs/source/doc_count_alleles.md
index 876c1b03..836352b0 100644
--- a/docs/source/doc_count_alleles.md
+++ b/docs/source/doc_count_alleles.md
@@ -1,6 +1,6 @@
# count-alleles

-Given one or more BAM files and lists of heterozygous SNP positions, this step of HATCHet counts the number of reads covering both the alleles of each identified heterozgyous SNP in every tumor sample.
+Given one or more BAM files and lists of heterozygous SNP positions, this step of HATCHet2 counts the number of reads covering both alleles of each identified heterozygous SNP in every tumor sample.

## Input

@@ -8,7 +8,7 @@ count-alleles takes in input sorted and indexed BAM files for multiple tumor sam
| Name | Description | Usage |
|------|-------------|-------|
-| `-T`, `--tumors` | A white-space separated list of sorted-indexed BAM files | The tumor samples from the same patient that are jointly analyzed by HATCHet |
+| `-T`, `--tumors` | A white-space-separated list of sorted and indexed BAM files | The tumor samples from the same patient that are jointly analyzed by HATCHet2 |
| `-N`, `--normal` | A sorted-indexed BAM file | The matched normal sample for the same patient |
| `-L`, `--snps` | VCF files | One or more files listing heterozygous SNP positions |
| `-r`, `--reference` | A FASTA file | The human reference genome used for germline variant calling |

diff --git a/docs/source/doc_count_reads.md b/docs/source/doc_count_reads.md
index ded3564c..2b81b36f 100644
--- a/docs/source/doc_count_reads.md
+++ b/docs/source/doc_count_reads.md
@@ -1,6 +1,6 @@
# count-reads

-This step of HATCHet uses the locations of heterozygous SNPs (called by `count-alleles`) to identify candidate bin thresholds between SNPs. Then, it counts the total number of reads in each sample between each set of candidate thresholds for use in constructing variable-length bins.
+This step of HATCHet2 uses the locations of heterozygous SNPs (called by `count-alleles`) to identify candidate bin thresholds between SNPs. Then, it counts the total number of reads in each sample between each set of candidate thresholds for use in constructing variable-length bins.
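As a concrete sketch of this step (the BAM and 1bed file names below are placeholders, and only the flags documented in the Input table that follows are used), an invocation could look like:

```shell
# Count reads between the candidate bin thresholds derived from the
# heterozygous germline SNPs in snps.1bed (typically produced by count-alleles).
hatchet count-reads -N normal.bam -T tumor1.bam tumor2.bam \
    -b snps.1bed -V hg19
```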
## Input

| Name | Description | Usage |
|------|-------------|-------|
-| `-T`, `--tumors` | A white-space separated list of sorted-indexed BAM files for tumor samples | The tumor samples from the same patient that are jointly analyzed by HATCHet |
+| `-T`, `--tumors` | A white-space-separated list of sorted and indexed BAM files for tumor samples | The tumor samples from the same patient that are jointly analyzed by HATCHet2 |
| `-N`, `--normal` | A sorted-indexed BAM file for matched-normal sample | The matched normal sample for the same patient |
| `-b`, `--baffile` | A 1bed file containing locations of heterozygous germline SNPs | Typically, a user would run `count-alleles` to obtain this file. |
| `-V`, `--refversion` | Reference genome version (hg19 or hg38 supported) | |

diff --git a/docs/source/doc_count_reads_fw.md b/docs/source/doc_count_reads_fw.md
index dd623581..03ec2dad 100644
--- a/docs/source/doc_count_reads_fw.md
+++ b/docs/source/doc_count_reads_fw.md
@@ -2,7 +2,7 @@
NOTE: This function (formerly called `comBBo`) uses the legacy fixed-width binning described in the HATCHet paper. We recommend using [`count-reads`](doc_count_reads.md) and [`combine-counts`](doc_combine_counts.md), which apply an adaptive binning scheme to ensure that each genomic bin has comparable BAF signal.

-This step of HATCHet splits the human reference genome into fixed-width bins (i.e., small genomic regions), and computes the number of sequencing reads aligned to each bin from every given tumor samples and from the matched normal sample.
+This step of HATCHet2 splits the human reference genome into fixed-width bins (i.e., small genomic regions), and computes the number of sequencing reads aligned to each bin from every given tumor sample and from the matched normal sample.

## Input

@@ -10,7 +10,7 @@ count-reads-fw takes in input sorted and indexed BAM files for multiple tumor sa
| Name | Description | Usage |
|------|-------------|-------|
-| `-T`, `--tumors` | A white-space separated list of sorted-indexed BAM files for tumor samples | The tumor samples from the same patient that are jointly analyzed by HATCHet |
+| `-T`, `--tumors` | A white-space-separated list of sorted and indexed BAM files for tumor samples | The tumor samples from the same patient that are jointly analyzed by HATCHet2 |
| `-N`, `--normal` | A sorted-indexed BAM file for matched-normal sample | The matched normal sample for the same patient |
| `-r`, `--reference` | A FASTA file | The human reference genome used for germline variant calling |

diff --git a/docs/source/doc_download_panel.md b/docs/source/doc_download_panel.md
index 13a6aa11..05b058d3 100644
--- a/docs/source/doc_download_panel.md
+++ b/docs/source/doc_download_panel.md
@@ -1,6 +1,6 @@
# download-panel

-This step of HATCHet downloads the 1000 genomes reference panel to phase germline mutations.
+This step of HATCHet2 downloads the 1000 genomes reference panel to phase germline mutations.
It also downloads and creates other files necessary for phasing when the user has aligned their reads to a version of the human reference genome that is not the same as the version used in the 1000 genomes project (which was hg19, with no 'chr' prefix preceding chromosome names).

**Note:** This step requires access to the internet in order to download files. This step only needs to be run once per system.

diff --git a/docs/source/doc_fullpipeline.md b/docs/source/doc_fullpipeline.md
index cfbd7835..352e1caa 100644
--- a/docs/source/doc_fullpipeline.md
+++ b/docs/source/doc_fullpipeline.md
@@ -1,18 +1,18 @@
## Full pipeline and tutorial

-We provide example [BASH scripts](script/README.md) that implement the entire pipeline of HATCHet.
+We provide example [BASH scripts](script/README.md) that implement the entire pipeline of HATCHet2.
 This script and its usage are described in detail in a guided [tutorial](doc_runhatchet.md).

-The user can simply use the script for every execution of HATCHet on different data by copying the script inside the running directory and changing the corresponding paths of the required data and dependencies at the beginning of the script, as described in the guided [tutorial](doc_runhatchet.md).
+The user can simply reuse the script for every execution of HATCHet2 on different data by copying the script inside the running directory and changing the corresponding paths of the required data and dependencies at the beginning of the script, as described in the guided [tutorial](doc_runhatchet.md).

## Demos

-Each demo is an example and guided execution of HATCHet on a dataset included in the corresponding demo's folder of this
-repository (inside `examples`). The demos are meant to illustrate how the user should apply HATCHet on different
-datasets characterized by different features, noise, and kind of data. In fact, the default parameters of HATCHet allow
-to successfully analyze most of the datasets but some of these may be characterized by special features or
-higher-than-expected variance of the data. Understanding the functioning of HATCHet, assessing the quality of the
-results, and tuning the few parameters needed to fit the unique features of the considered data thus become crucial to
-guarantee to always obtain the best-quality results. These are the goals of these demos.
+Each demo is an example and guided execution of HATCHet2 on a dataset included in the corresponding demo's folder of this
+repository (inside `examples`). The demos are meant to illustrate how the user should apply HATCHet2 on different
+datasets characterized by different features, noise, and kinds of data. In fact, the default parameters of HATCHet2 allow
+the user to successfully analyze most datasets, but some datasets may be characterized by special features or
+higher-than-expected variance of the data. Understanding the functioning of HATCHet2, assessing the quality of the
+results, and tuning the few parameters needed to fit the unique features of the considered data thus become crucial to
+guarantee that you always obtain the best-quality results. These are the goals of these demos.
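For instance, using the `demo-complete` folder listed in this patch's file summary, a demo can be run directly with BASH; this is a sketch and assumes HATCHet2 and its dependencies are already installed in the active environment:

```shell
# Each demo file is both a guided Markdown document and an executable
# BASH script; run it from inside its own folder.
cd examples/demo-complete
bash demo-complete.sh
```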
@@ -23,7 +23,7 @@ time the following demos are available (more demos will be added in the near fut

| Demo | Description |
|------|-------------|
-| [demo-complete](examples/demo-complete/demo-complete.html) | A demo of the complete HATCHet pipeline starting from an example dataset of tumour and matched normal BAM files |
+| [demo-complete](examples/demo-complete/demo-complete.html) | A demo of the complete HATCHet2 pipeline starting from an example dataset of tumour and matched normal BAM files |
| [demo-wgs-sim](examples/demo-WGS-sim/demo-wgs-sim.html) | A demo on a typical WGS (whole-genome sequencing) multi-sample dataset with standard noise and variance of the data |
| [demo-wgs-cancer](examples/demo-WGS-cancer/demo-wgs-cancer.html) | A demo on a cancer WGS (whole-genome sequencing) multi-sample dataset with high noise and variance of the data |
| [demo-wes](examples/demo-WES/demo-wes.html) | A demo on a cancer WES (whole-exome sequencing) multi-sample dataset, which is typically characterized by very high variance of RDR |
@@ -31,34 +31,34 @@ time the following demos are available (more demos will be added in the near fut

## Custom pipelines

-The repository includes custom pipelines which have been designed to adapt the complete pipeline of HATCHet to special
+The repository includes custom pipelines which have been designed to adapt the complete pipeline of HATCHet2 to special
conditions or to integrate the processed data produced by other pipelines. Each custom pipeline is a variation of the
-main HATCHet's pipeline, we thus recommend the user to always first carefully understand the main
+main HATCHet2 pipeline; we thus recommend the user to always first carefully understand the main
[BASH script](script/README.md) through the corresponding guided [tutorial](doc_runhatchet.md) and to carefully
-understand the provided [demos](#demos) to properly apply HATCHet for best-quality results. Each custom pipeline also
+understand the provided [demos](#demos) to properly apply HATCHet2 for best-quality results. Each custom pipeline also
includes a specific demo which represents a guided and executable example on example data.

| Name | Description | Script | Demo | Variations |
|------|-------------|--------|------|------------|
-| GATK4-CNV | Custom pipeline for segmented files from GATK4 CNV pipeline | [custom-gatk4-cnv.sh](custom-gatk4-cnv.sh) | [demo-gatk4-cnv.sh](custom/GATK4-CNV/demo-gatk4-cnv.html) | This custom pipeline takes the input the segmented files which already contain the estimated RDR and BAF. As such, the first pre-processing steps of HATCHet (`count-reads`, `count-alleles`, and `combine-counts`) are not needed; for this reason, the following depdencies SAMtools and BCFtools and the following required data, human reference genome, matched-normal sample, and BAM files, are not needed in this case. |
+| GATK4-CNV | Custom pipeline for segmented files from GATK4 CNV pipeline | [custom-gatk4-cnv.sh](custom-gatk4-cnv.sh) | [demo-gatk4-cnv.sh](custom/GATK4-CNV/demo-gatk4-cnv.html) | This custom pipeline takes as input the segmented files which already contain the estimated RDR and BAF. As such, the first pre-processing steps of HATCHet2 (`count-reads`, `count-alleles`, and `combine-counts`) are not needed; for this reason, the dependencies SAMtools and BCFtools, as well as the required data (human reference genome, matched-normal sample, and BAM files), are not needed in this case.
|

## Detailed steps

-The full pipeline of HATCHet is composed of 9 sequential steps (and an additional dependency-checking command), starting from the required input data.
+The full pipeline of HATCHet2 is composed of 9 sequential steps (and an additional dependency-checking command), starting from the required input data.
The description of each step also includes the details of the corresponding input/output that are especially useful when
-one wants to replace or change some of the steps in the pipeline while guaranteeing the correct functioning of HATCHet.
-Each step `<step>` of HATCHet can be run with the following command within a HATCHet conda environment:
+one wants to replace or change some of the steps in the pipeline while guaranteeing the correct functioning of HATCHet2.
+Each step `<step>` of HATCHet2 can be run with the following command within a HATCHet2 conda environment:

```shell
hatchet <step>
```

-**Note**: This version of HATCHet uses variable-width bins to ensure that each bin has comparable B-allele frequency (BAF) signal from heterogeneous germline SNPs and to account for sequencing coverage. To run the older versions of HATCHet with fixed-width bins, use [*count-reads-fw*](doc_count_reads_fw.html) (formerly binBAM) instead of *count-reads* and [*combine-counts-fw*](doc_combine_counts_fw.html) (formerly comBBo) instead of *combine-counts*. If you are using the `run` command, set `fixed_width = True` under the `[run]` header in your `.ini` file.
+**Note**: This version of HATCHet2 uses variable-width bins to ensure that each bin has comparable B-allele frequency (BAF) signal from heterozygous germline SNPs and to account for sequencing coverage. To run the older versions of HATCHet2 with fixed-width bins, use [*count-reads-fw*](doc_count_reads_fw.html) (formerly binBAM) instead of *count-reads* and [*combine-counts-fw*](doc_combine_counts_fw.html) (formerly comBBo) instead of *combine-counts*. If you are using the `run` command, set `fixed_width = True` under the `[run]` header in your `.ini` file.

-Additionally, this version of HATCHet uses a new locality-aware clustering module that incorporates local information along the genome. This replaces the previous GMM-based clustering (command named cluBB). Users can still use GMM-based clustering if they wish by setting `loc-clust` to `False` under the `[run]` header in `hatchet.ini`, or by running [*cluster-bins-gmm*](doc_cluster_bins_gmm.md) directly.
+Additionally, this version of HATCHet2 uses a new locality-aware clustering module that incorporates local information along the genome. This replaces the previous GMM-based clustering (command named cluBB). Users can still use GMM-based clustering if they wish by setting `loc-clust` to `False` under the `[run]` header in `hatchet.ini`, or by running [*cluster-bins-gmm*](doc_cluster_bins_gmm.md) directly.

-*Older versions of HATCHet used different names for these steps. The `Old Name` column lists those names.*
+*Older versions of HATCHet2 used different names for these steps. The `Old Name` column lists those names.*

| Order | Step | Old Name | Description |
|-------|------|----------|-------------|
@@ -76,11 +76,11 @@ Additionally, this version of HATCHet uses a new locality-aware clustering modul

## Recommendations and quality control

-All the components of HATCHet's pipeline use some basic parameters that allow to deal with data characterized by different features. The default values of these parameters allow one to succesfully apply HATCHet on most datasets.
However, special or noisy datasets may require to tune some parameters. The user can deal with these cases by following the recommendations reported here, reading the descriptions of the various steps, and using the informative plots to verify the results. In the following guides and recommentations, we guide the user in the interpration of HATCHet's inference, we explain how to perform quality control to guarantee the best-quality results, and we describe how the user can control and tune some of the parameters to obtain the best-fitting results. We thus split the recommendations into distinct topics with dedicated descriptions.
+All the components of HATCHet2's pipeline use some basic parameters that allow to deal with data characterized by different features. The default values of these parameters allow one to successfully apply HATCHet2 on most datasets. However, special or noisy datasets may require tuning some parameters. The user can deal with these cases by following the recommendations reported here, reading the descriptions of the various steps, and using the informative plots to verify the results. In the following guides and recommendations, we guide the user in the interpretation of HATCHet2's inference, we explain how to perform quality control to guarantee the best-quality results, and we describe how the user can control and tune some of the parameters to obtain the best-fitting results. We thus split the recommendations into distinct topics with dedicated descriptions.

| Recommendation | Description |
|----------------|-------------|
-| [Analyze HATCHet inference](recommendation_inference.html) | Interpret HATCHet's inference, quality and error control, and investigate alternative solutions. |
+| [Analyze HATCHet2 inference](recommendation_inference.html) | Interpret HATCHet2's inference, quality and error control, and investigate alternative solutions. |
| [Analyze global clustering](recommendation_clustering.html) | Interpret global clustering, quality and error control, and parameter tuning |
| [Analyze different type of data](recommendation_datatype.html) | Tuning parameters to better analyze different types of data, such as those from WES |
| [Improve running time](recommendation_runtime.html)| Tips for improving running time of the whole pipeline |
diff --git a/docs/source/doc_genotype_snps.md b/docs/source/doc_genotype_snps.md
index 11b43c3d..22063913 100644
--- a/docs/source/doc_genotype_snps.md
+++ b/docs/source/doc_genotype_snps.md
@@ -1,6 +1,6 @@
# genotype-snps

-Given the normal BAM file, this step of HATCHet identifies heterozygous germline SNP positions. The user can restrict candidate positions to a given list (e.g., dbSNP) using the `-R, --snps` argument.
+Given the normal BAM file, this step of HATCHet2 identifies heterozygous germline SNP positions. The user can restrict candidate positions to a given list (e.g., dbSNP) using the `-R, --snps` argument.

## Input
diff --git a/docs/source/doc_manual_install.md b/docs/source/doc_manual_install.md
index 0164ef28..4eabe718 100644
--- a/docs/source/doc_manual_install.md
+++ b/docs/source/doc_manual_install.md
@@ -1,17 +1,17 @@
### Manual Installation

-If you wish to install `HATCHet` directly from this repository, the steps are a bit more involved.
+If you wish to install `HATCHet2` directly from this repository, the steps are a bit more involved.
Note that the complexity of manual installation is largely because the `compute-cn` step (determination of -allele-specific copy numbers) of the HATCHet pipeline uses custom-written C++11 code that uses the +allele-specific copy numbers) of the HATCHet2 pipeline uses custom-written C++11 code that uses the [Gurobi](http://www.gurobi.com/) optimizer. If you do not have a valid Gurobi license (though it is [easily available](http://www.gurobi.com/academia/academia-center) for users in academia), then the C++ parts of -HATCHet do not necessarily need to be compiled, and you can read the -[Compiling HATCHet without the built-in Gurobi optimizer](#withoutgurobi) section of this page. +HATCHet2 do not necessarily need to be compiled, and you can read the +[Compiling HATCHet2 without the built-in Gurobi optimizer](#withoutgurobi) section of this page. -#### Compiling HATCHet with the built-in Gurobi optimizer +#### Compiling HATCHet2 with the built-in Gurobi optimizer -The core optimization module of HATCHet is written in C++11 and thus requires a modern C++ compiler (GCC >= 4.8.1, or Clang). +The core optimization module of HATCHet2 is written in C++11 and thus requires a modern C++ compiler (GCC >= 4.8.1, or Clang). As long as you have a recent version of GCC or Clang installed, `setuptools` should automatically be able to download a recent version of `cmake` and compile the Hatchet code into a working package. @@ -19,7 +19,7 @@ The installation process can be broken down into the following steps: 1. **Get [Gurobi](http://www.gurobi.com/)** (v9.0.2) - The coordinate-method applied by HATCHet is based on several integer linear programming (ILP) formulations. Gurobi is a commercial ILP solver with two licensing options: (1) a single-host license where the license is tied to a single computer and (2) a network license for use in a compute cluster (using a license server in the cluster). Both options are freely and [easily available](http://www.gurobi.com/academia/academia-center) for users in academia. + The coordinate-method applied by HATCHet2 is based on several integer linear programming (ILP) formulations. Gurobi is a commercial ILP solver with two licensing options: (1) a single-host license where the license is tied to a single computer and (2) a network license for use in a compute cluster (using a license server in the cluster). Both options are freely and [easily available](http://www.gurobi.com/academia/academia-center) for users in academia. [Download](https://www.gurobi.com/downloads/gurobi-optimizer-eula) Gurobi for your specific platform. @@ -64,9 +64,9 @@ want to create either a new Conda environment for Python 3 and activate it: ``` -6. **Build and install HATCHet** +6. **Build and install HATCHet2** - Execute the following commands from the root of HATCHet's repository. + Execute the following commands from the root of HATCHet2's repository. ```shell $ pip install . ``` @@ -90,7 +90,7 @@ want to create either a new Conda environment for Python 3 and activate it: If you want to perform reference-based phasing, you must also install [shapeit](https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html), [picard](https://broadinstitute.github.io/picard/), and [bgzip](http://www.htslib.org/doc/). The easiest way to install these is via `conda`, as all are available from the `bioconda` channel (except `shapeit` which is available from channel `dranew`). 
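If you plan to use reference-based phasing, a single `conda` command along these lines should cover all four tools (a sketch under the channel assumptions stated above; `bgzip` is provided by the `htslib` package):

```shell
# shapeit comes from the dranew channel; picard, bcftools and htslib (bgzip)
# come from bioconda. Package names other than shapeit are assumptions.
conda install -c bioconda -c dranew shapeit picard bcftools htslib
```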
-#### Compiling HATCHet *without* the built-in Gurobi optimizer +#### Compiling HATCHet2 *without* the built-in Gurobi optimizer If you wish to use an alternate ILP optimizer, then you do not need a C++ compiler. diff --git a/docs/source/doc_phase_snps.md b/docs/source/doc_phase_snps.md index d65686d1..1deeeac2 100644 --- a/docs/source/doc_phase_snps.md +++ b/docs/source/doc_phase_snps.md @@ -2,7 +2,7 @@ **Note:** To run this step, you must first run [*download-panel*](doc_download_panel.html) to download the reference-based phasing panel, and specify its location via the argument `-D, --refpaneldir`. The `download-panel` command only needs to be run once per system. -This step of HATCHet phases genotypes found in VCF files. It automatically takes care of differences in coordinates if the user has aligned their reads to a version of the reference genome (e.g. hg38) that is different from the version used in the reference panel (e.g. hg19 for the 1000 genomes project), using the liftover utility within Picard. Once genotypes are lifted over and phased, we again perform one last liftover to the coordinates of the genome used for alignment. Liftover is skipped if the version of the reference genome used for alignments corresponds to the same version used in the reference panel. Lastly, in order to account for differences in naming conventions for chromosomes, with or without the "chr" prefix, we also add or remove these prefixes so that chromosome names correspond to those used in the reference panel (without "chr" for the 1000 genomes project hg19 panel). +This step of HATCHet2 phases genotypes found in VCF files. It automatically takes care of differences in coordinates if the user has aligned their reads to a version of the reference genome (e.g. hg38) that is different from the version used in the reference panel (e.g. hg19 for the 1000 genomes project), using the liftover utility within Picard. Once genotypes are lifted over and phased, we again perform one last liftover to the coordinates of the genome used for alignment. Liftover is skipped if the version of the reference genome used for alignments corresponds to the same version used in the reference panel. Lastly, in order to account for differences in naming conventions for chromosomes, with or without the "chr" prefix, we also add or remove these prefixes so that chromosome names correspond to those used in the reference panel (without "chr" for the 1000 genomes project hg19 panel). ## Input @@ -31,10 +31,10 @@ The following files will be placed in the directory indicated by `-o, --outdir`: ## Main Parameters -If HATCHet is installed via `conda`, the dependencies (`shapeit`, `picard`, `bcftools`, and `bgzip`) should be installed automatically and available on the PATH. +If HATCHet2 is installed via `conda`, the dependencies (`shapeit`, `picard`, `bcftools`, and `bgzip`) should be installed automatically and available on the PATH. You can verify that these dependencies are available by running [the check command](doc_check.md), i.e., `hatchet check`. 
- If HATCHet is installed from source, you may need to install them youself (i.e., via `conda` or from source) and/or specify their locations using the following arguments:
+ If HATCHet2 is installed from source, you may need to install them yourself (i.e., via `conda` or from source) and/or specify their locations using the following arguments:

| Name | Description | Usage |
|------|-------------|-------|
diff --git a/docs/source/doc_plot_bins.md b/docs/source/doc_plot_bins.md
index a9f101f6..9c9136fb 100644
--- a/docs/source/doc_plot_bins.md
+++ b/docs/source/doc_plot_bins.md
@@ -75,7 +75,7 @@ The command generates a series of 2d-scatter plots where x-axis corresponds to t

### CBB

-The command generates a series of 2d-scatter plots where x-axis corresponds to the mirrored BAF and the y-axis corresponds to RDR. More specifically, a plot is generated for every sample and every point is drawn according to the corresponding values of RDR and BAF. The points are colored according to the clusters computed by HATCHet. Remember that a cluster contains all the genomic regions that have the same copy-number state in every clone.
+The command generates a series of 2d-scatter plots where x-axis corresponds to the mirrored BAF and the y-axis corresponds to RDR. More specifically, a plot is generated for every sample and every point is drawn according to the corresponding values of RDR and BAF. The points are colored according to the clusters computed by HATCHet2. Remember that a cluster contains all the genomic regions that have the same copy-number state in every clone.

    python -m hatchet plot-bins A12.bbc -c CBB --figsize 4,1.1 -m tab20 --markersize 1 --xmax 3.5 --xmin 0.5 --colwrap 3 -tS 0.005

@@ -85,7 +85,7 @@ The command generates a series of 2d-scatter plots where x-axis corresponds to t

### CRDR

-The command generates a series of 2d-scatter plots where x-axis corresponds to the RDR and to bins sorted by chromosome and positions. More specifically, a plot is generated for every sample and every point is drawn according to the corresponding values of RDR. The points are colored according to the clusters computed by HATCHet. Remember that a cluster contains all the genomic regions that have the same copy-number state in every clone.
+The command generates a series of 2d-scatter plots where x-axis corresponds to the RDR and to bins sorted by chromosome and positions. More specifically, a plot is generated for every sample and every point is drawn according to the corresponding values of RDR. The points are colored according to the clusters computed by HATCHet2. Remember that a cluster contains all the genomic regions that have the same copy-number state in every clone.

    python -m hatchet plot-bins A12.bbc -c CRD -m tab20 --figsize 9,2.5 --markersize 1 --ymax 8 --ymin 0

@@ -95,7 +95,7 @@ The command generates a series of 2d-scatter plots where x-axis corresponds to t

### CBAF

-The command generates a series of 2d-scatter plots where x-axis corresponds to the BAF and to bins sorted by chromosome and positions. More specifically, a plot is generated for every sample and every point is drawn according to the corresponding values of BAF. The points are colored according to the clusters computed by HATCHet. Remember that a cluster contains all the genomic regions that have the same copy-number state in every clone.
+The command generates a series of 2d-scatter plots where x-axis corresponds to the BAF and to bins sorted by chromosome and positions.
More specifically, a plot is generated for every sample and every point is drawn according to the corresponding values of BAF. The points are colored according to the clusters computed by HATCHet2. Remember that a cluster contains all the genomic regions that have the same copy-number state in every clone. python -m hatchet plot-bins A12.bbc -c CBAF -m tab20 --figsize 9,2.5 --markersize 1 --ymax 8 --ymin 0 diff --git a/docs/source/doc_plot_cn_1d2d.md b/docs/source/doc_plot_cn_1d2d.md index a46cfa68..b2df5793 100644 --- a/docs/source/doc_plot_cn_1d2d.md +++ b/docs/source/doc_plot_cn_1d2d.md @@ -2,7 +2,7 @@ This step produces alternate plots that show bins in terms of their computed read-depth ratios (RDR), B-allele frequencies (BAF), and assigned copy-number states. These plots show bins colored by cluster, where the color is consistent between the "2D" (RDR x BAF) view and the "1D" (genomic location x RDR/BAF) view. -Additionally, the labeled points in the 2D plots and the black bars in the 1D plots show the *expected* positions of the assigned copy-number states (determined by the mixture proportions and fractional copy number scaling). These indicators can be used to evaluate the consistency of the HATCHet solution. +Additionally, the labeled points in the 2D plots and the black bars in the 1D plots show the *expected* positions of the assigned copy-number states (determined by the mixture proportions and fractional copy number scaling). These indicators can be used to evaluate the consistency of the HATCHet2 solution. When `plot_cn = True` is indicated in `hatchet.ini`, both this command and the command [plot-cn](doc_plot_cn.md) will be run. diff --git a/docs/source/doc_runhatchet.md b/docs/source/doc_runhatchet.md index 6522af5f..6be59988 100644 --- a/docs/source/doc_runhatchet.md +++ b/docs/source/doc_runhatchet.md @@ -55,7 +55,7 @@ Next, you need to specify the full paths to the required input data: 1. `normal` is the full path to the BAM file of matched-normal samples 2. `bams` is a white-space separated list of the BAM files for the multiple tumor samples from the considered patient. -The variable `samples` is also a white-space separated list of tumor sample names (specified in the same order as the BAM files in `bams`), and these names are used in the plots produced by HATCHet. +The variable `samples` is also a white-space separated list of tumor sample names (specified in the same order as the BAM files in `bams`), and these names are used in the plots produced by HATCHet2. *** @@ -119,9 +119,9 @@ python3 -m hatchet combine-counts -A ${RDR} -t ${RDR}total.tsv -b {BAF}tumor.1be -p ${PHASE} -s ${max_blocksize} -m {max_spb} -a {alpha} # optional phasing args ``` -combine-counts constructs genomic bins such that in all samples, each bin has at least `${msr}` SNP-covering reads and at least `${mtr}` total reads. Bins will not have the same width, but using this rule each bin will have comparable RDR and BAF signals for the following clustering steps. The BAF for each bin and the relative phase of all SNPs in the bin are inferred via EM, and the RDR for each bin is computed fom the read count files in the `${RDR}` folder. After this computation, RDR values are normalized using the total reads in each sample (from `${RDR}total.tsv`). As with other HATCHet commands, `-j ${J}` controls the number of parallel processes. +combine-counts constructs genomic bins such that in all samples, each bin has at least `${msr}` SNP-covering reads and at least `${mtr}` total reads. 
Bins will not have the same width, but using this rule each bin will have comparable RDR and BAF signals for the following clustering steps. The BAF for each bin and the relative phase of all SNPs in the bin are inferred via EM, and the RDR for each bin is computed from the read count files in the `${RDR}` folder. After this computation, RDR values are normalized using the total reads in each sample (from `${RDR}total.tsv`). As with other HATCHet2 commands, `-j ${J}` controls the number of parallel processes.

-See the [script](script/README.md) directory for a guide on how to run HATCHet with phasing. If a phased VCF file is supplied via `-p, --phase ${PHASE}` (e.g., `-p phase/phased.vcf.gz`), SNPs are merged into blocks before BAF inference. Each block contains at most `${max_spb}` such that no two SNPs in the same block are further apart than `${max_blocksize}`, and such that no two adjacent SNPs have significantly different marginal BAF estimates (at significance level `${alpha}` -- higher `${alpha}` corresponds to less trust in the phasing results). Then, blocks are passed to the EM which determines the relative phase of each block.
+See the [script](script/README.md) directory for a guide on how to run HATCHet2 with phasing. If a phased VCF file is supplied via `-p, --phase ${PHASE}` (e.g., `-p phase/phased.vcf.gz`), SNPs are merged into blocks before BAF inference. Each block contains at most `${max_spb}` SNPs, such that no two SNPs in the same block are further apart than `${max_blocksize}`, and such that no two adjacent SNPs have significantly different marginal BAF estimates (at significance level `${alpha}` -- higher `${alpha}` corresponds to less trust in the phasing results). Then, blocks are passed to the EM which determines the relative phase of each block.

## [cluster-bins](doc_cluster_bins.html)

@@ -155,7 +155,7 @@ python3 -m hatchet plot-bins -c CBB ../${BBC}bulk.bbc -tS 0.01
```

plot-bins produces informative plots which are described [here](doc_plot_bins.md).
-Many of these plots can be very useful to assess the performance of the various steps of HATCHet, especially in the case of noisy datasets.
+Many of these plots can be very useful to assess the performance of the various steps of HATCHet2, especially in the case of noisy datasets.

## [compute-cn](doc_compute_cn.html)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 1a17b075..82c8dcde 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,7 +1,7 @@
-HATCHet
-=======
+HATCHet2
+========

-HATCHet is an algorithm to infer allele and clone-specific copy-number aberrations (CNAs), clone proportions, and whole-genome duplications (WGD) for several tumor clones jointly from multiple bulk-tumor samples of the same patient or from a single bulk-tumor sample.
+HATCHet2 is an algorithm to infer allele and clone-specific copy-number aberrations (CNAs), clone proportions, and whole-genome duplications (WGD) for several tumor clones jointly from multiple bulk-tumor samples of the same patient or from a single bulk-tumor sample.


.. toctree::
diff --git a/docs/source/recommendation_clustering.md b/docs/source/recommendation_clustering.md
index aa7619ac..9a28a854 100644
--- a/docs/source/recommendation_clustering.md
+++ b/docs/source/recommendation_clustering.md
@@ -1,6 +1,6 @@
# Analyze global clustering

-The global clustering performed along the genome and jointly across samples is a crucial feature of HATCHet and the quality of the final results is strongly affected by the quality of the clustering.
This global clustering is performed by HATCHet's component `cluster-bins`, whose default values are suitable for many datasets. However, for ideal results on specific datasets these parameters may need to be modified.
+The global clustering performed along the genome and jointly across samples is a crucial feature of HATCHet2 and the quality of the final results is strongly affected by the quality of the clustering. This global clustering is performed by HATCHet2's component `cluster-bins`, whose default values are suitable for many datasets. However, for ideal results on specific datasets these parameters may need to be modified.

The module `cluster-bins` incorporates genomic position to improve clustering using a Gaussian hidden Markov model (GHMM), as opposed to the position-agnostic Gaussian mixture model (GMM) used in `cluster-bins-gmm` and described in the original HATCHet publication. This page describes how to tune the parameters of `cluster-bins` -- for recommendations on `cluster-bins-gmm`, see [this page](recommendation_old_clustering.md) instead.
diff --git a/docs/source/recommendation_datatype.md b/docs/source/recommendation_datatype.md
index 69deed03..cdb42123 100644
--- a/docs/source/recommendation_datatype.md
+++ b/docs/source/recommendation_datatype.md
@@ -1,6 +1,6 @@
# Analyze different type of data

-The default values in the complete pipeline of HATCHet are typically used for analyzing whole-genome sequencing (WGS) data. However, when considering different type of data, as those from whole-exome sequencing (WES) data, users should adjust some of the parameters due to the different features of this kind of data. More specifically, there are 4 main points to consider when analyzing WES data:
+The default values in the complete pipeline of HATCHet2 are typically used for analyzing whole-genome sequencing (WGS) data. However, when considering different types of data, such as those from whole-exome sequencing (WES), users should adjust some of the parameters due to the different features of this kind of data. More specifically, there are 4 main points to consider when analyzing WES data:

- *Bin sizes*. One can use the plots from [plot-bins](https://github.com/raphael-group/hatchet/blob/master/doc/doc_plot_bins.html) to test different parameters (`--mtr` and `--msr` for variable-width, bin size for fixed width) and inspect the amount of variance and/or the separation between apparent clusters.
  * **Variable-width** Having a sufficient number of germline SNPs is needed to have good estimations with low variances for RDR and, especially, for the B-allele frequency (BAF) of each bin. Variable-width binning attempts to account for this by adjusting bin widths to ensure enough total and SNP-covering reads in each bin. You can tune the average bin width using the `--msr` (min. SNP-covering reads, default 5000) and `--mtr` (min. total reads, default 5000) parameters to `combine-counts`. Generally, `--msr` is more important because a bin with enough SNP-covering reads to get a good BAF estimate will almost certainly have enough total reads to get a good RDR estimate. Increasing these parameters produces larger bins (on average) with lower variance, while decreasing these values produces smaller bins (on average) with higher variance (see the sketch below).
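As a concrete sketch of the bin-size tuning described in the variable-width bullet above (the input arguments mirror the tutorial's `combine-counts` invocation; the `8000` thresholds are purely illustrative, not recommended defaults):

```shell
# Hypothetical re-run for WES data: raising --msr/--mtr above their 5000
# defaults yields larger, lower-variance bins; paths follow the tutorial.
python3 -m hatchet combine-counts -A ${RDR} -t ${RDR}total.tsv -b ${BAF}tumor.1bed \
    -j ${J} --msr 8000 --mtr 8000
```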
diff --git a/docs/source/recommendation_inference.md b/docs/source/recommendation_inference.md
index 07957327..44804410 100644
--- a/docs/source/recommendation_inference.md
+++ b/docs/source/recommendation_inference.md
@@ -1,6 +1,6 @@
-# Analyze HATCHet's inference
+# Analyze HATCHet2's inference

-The main component of HATCHet ([compute-cn](doc_compute_cn.md) step) performs three major tasks: (1) explicit estimation of fractional copy numbers, (2) inference of allele and clone-specific copy numbers, and (3) joint prediction of number of clones and whole-genome duplication. In the following, we guide the user in the interpration of HATCHet's inference, we explain how to perform quality control to guarantee the best-quality results, and we describe how the user can control and tune some of the parameters to obtain the best-fitting results. In fact, it is important to assess the quality of the results from each of these steps to guarantee the best-quality results, especially when considering datasets with high noise or special features.
+The main component of HATCHet2 ([compute-cn](doc_compute_cn.md) step) performs three major tasks: (1) explicit estimation of fractional copy numbers, (2) inference of allele and clone-specific copy numbers, and (3) joint prediction of number of clones and whole-genome duplication. In the following, we guide the user in the interpretation of HATCHet2's inference, we explain how to perform quality control to guarantee the best-quality results, and we describe how the user can control and tune some of the parameters to obtain the best-fitting results. In fact, it is important to assess the quality of the results from each of these steps, especially when considering datasets with high noise or special features.

1. [Estimation of fractional copy numbers](#estimation)
2. [Inference of allele and clone-specific copy numbers](#inference)
@@ -10,11 +10,11 @@ The main component of HATCHet ([compute-cn](doc_compute_cn.md) step) performs th

## 1. Estimation of fractional copy numbers

-HATCHet estimates the fractional copy numbers for all segments by identifying 1 or 2 tumor clonal clusters (i.e, a cluster which have the same CNA in all tumor clones). First, HATCHet selects only some of the clusters as potential clonal clusters and, next, it aims to find consistent combinations of clonal clusters.
+HATCHet2 estimates the fractional copy numbers for all segments by identifying 1 or 2 tumor clonal clusters (i.e., a cluster which has the same CNA in all tumor clones). First, HATCHet2 selects only some of the clusters as potential clonal clusters and, next, it aims to find consistent combinations of clonal clusters.

#### Selecting potential clonal clusters

-HATCHet selects potential clonal clusters as sufficiently large clusters; the list of the selected clusters is reported in the LOG of the `HATCHet` step starting with line `# Selected clusters`. This list reports the cluster size and the corresponding pairs of values (RDR, BAF) in each sample (alphabetically sorted) and, therefore, can be easily used to map these clusters to the `bb_clustered.pdf` figure generated by the command `CBB` of `plot-bins` or the `plot-bins-1d2d` command. For example, we have the following:
+HATCHet2 selects potential clonal clusters as sufficiently large clusters; the list of the selected clusters is reported in the LOG of the `HATCHet2` step starting with line `# Selected clusters`.
This list reports the cluster size and the corresponding pairs of values (RDR, BAF) in each sample (alphabetically sorted) and, therefore, can be easily used to map these clusters to the `bb_clustered.pdf` figure generated by the command `CBB` of `plot-bins` or the `plot-bins-1d2d` command. For example, we have the following: # Selected clusters: 11, 10, 12, 15, 21, 22, 23 ## Features of selected clusters: @@ -36,7 +36,7 @@ When using the default value of `-ts 0.008`, the cluster `2` will be excluded as #### Identifying combinations of clonal clusters -HATCHet chooses the largest combination of consistent clonal clusters to obtain the needed clonal cluster and to estimate the fractional copy numbers, when assuming the occurrence of a WGD. The combinations considered by HATCHet are reported in the LOG of the `HATCHet` step, after the inferences with the absence of a WGD (`# Running diploid`) and starting with line `# Finding clonal clusters and their copy numbers`. Each combination (also called pattern) is reported by first specifying the total size of the involved clusters and next the copy number state of each cluster. For example, we have the following: +HATCHet2 chooses the largest combination of consistent clonal clusters to obtain the needed clonal cluster and to estimate the fractional copy numbers, when assuming the occurrence of a WGD. The combinations considered by HATCHet2 are reported in the LOG of the `HATCHet2` step, after the inferences with the absence of a WGD (`# Running diploid`) and starting with line `# Finding clonal clusters and their copy numbers`. Each combination (also called pattern) is reported by first specifying the total size of the involved clusters and next the copy number state of each cluster. For example, we have the following: # Finding clonal clusters and their copy numbers ## Found pattern of size 1080892751.0: {'8': (2, 1), '28': (3, 2), '50': (2, 2), '29': (4, 2), '23': (2, 0)} @@ -61,22 +61,22 @@ because in the `bb_clustered.pdf` figure there is another clear cluster `2` whic ## 2. Inference of allele and clone-specific copy numbers -HATCHet infers allele and clone-specific copy numbers by first assuming the absence of a WGD (`# Running diploid`) and, next, by assuming the presence of a WGD (`# Running tetraploid`). In each of the two cases, HATCHet considers an increasing number of clones (including the normal diploid clone). Typically, the interval starts from 2 (i.e. 1 tumor clone) up to 6-8 clones which is the typical maximum number of clones in terms of CNAs; however, the user can consider larger numbers, especially when suspecting the presence of more clones, by controlling the parameter `-n`, e.g. typical intervals are `-n 2,6`, `-n 2,8`, `-n 2,10`, `-n 2,12`, ... +HATCHet2 infers allele and clone-specific copy numbers by first assuming the absence of a WGD (`# Running diploid`) and, next, by assuming the presence of a WGD (`# Running tetraploid`). In each of the two cases, HATCHet2 considers an increasing number of clones (including the normal diploid clone). Typically, the interval starts from 2 (i.e. 1 tumor clone) up to 6-8 clones which is the typical maximum number of clones in terms of CNAs; however, the user can consider larger numbers, especially when suspecting the presence of more clones, by controlling the parameter `-n`, e.g. typical intervals are `-n 2,6`, `-n 2,8`, `-n 2,10`, `-n 2,12`, ... 
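For instance, widening this search is just a matter of the `-n` interval (a hedged sketch: the input/output arguments of `compute-cn` are elided here because they depend on your run layout, and `-eD`/`-eT`/`-u` are the limits discussed in the next subsection):

```shell
# Hypothetical fragment: consider 2 to 8 clones, cap total copy numbers at 6
# (no WGD) and 10 (WGD), and require clone proportions of at least 10%.
hatchet compute-cn -n 2,8 -eD 6 -eT 10 -u 0.1  # plus the usual I/O arguments
```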
#### Maximum copy numbers and minimum clone proportion

-For each assumpotion and number of clones, HATCHet infers allele and clone-specific copy-numbers by using two main parameters, the *maximum copy numbers (`-eD` and `-eT`)* and the *minimum clone proportion (`-u`)*. These parameters are very important to constrain the solution space depending on the needs of the user, in particular the minimum clone proportion `-u` is particularly important to deal with noisy datasets and we reccomend the user to follow the criterion described here below:
+For each assumption and number of clones, HATCHet2 infers allele and clone-specific copy-numbers by using two main parameters, the *maximum copy numbers (`-eD` and `-eT`)* and the *minimum clone proportion (`-u`)*. These parameters are very important to constrain the solution space depending on the needs of the user; in particular, the minimum clone proportion `-u` is important for dealing with noisy datasets, and we recommend the user to follow the criterion described below:

- **Maximum copy numbers (`-eD` and `-eT`)**: these values define the maximum value of the total copy number when assuming the absence of a WGD (`-eD`) and the presence of a WGD (`-eT`). By default the value of these parameters is `0`, meaning that these values are inferred from the estimated fractional copy numbers. However, we recommend the user to try common values, especially in the first attempts (e.g. 5, 6, 8 for `-eD` and 8, 10, 12 for `-eT`), to constrain the search space and prevent noisy clusters from introducing outlying values.
- **Minimum clone proportion `-u`**: this value defines the minimum clone proportion for the inferred tumor clones. Default values are pretty small, e.g. 0.03 or 0.05, and allow to infer tumor clones present in small proportions. However, the power of the inference and the capability of inferring tumor clones with small proportions strongly depend on the noise of the data. As such, especially for noisy and special datasets it is very important that the user considers higher values when possible, e.g. 0.1-0.15. We recommend to consider the following criterion to avoid overfitting and find the best fitting value for `-u`: **start from very small minimum clone proportions and increase the value whenever the inferred solutions contain clones with clone proportions equal to this threshold**.

## 3. Joint selection of number of clones and whole-genome duplication

-For each number of clones N, the related copy numbers are computed by HATCHet and placed in two output files:
+For each number of clones N, the related copy numbers are computed by HATCHet2 and placed in two output files:

- A `BBC-UCN` file which adds to the input `BBC` file the copy-number state and proportion inferred for each clone (the normal clone is always the first). The file is named `results.diploid.nN.bbc.ucn.tsv` or `results.tetraploid.nN.bbc.ucn.tsv` for the solutions obtained when considering the absence or presence of a WGD, respectively.
- A `SEG-UCN` file which combines neighboring bins with the same copy-number states into segments and for each segment it reports the copy-number state and proportion of each clone. The file is named `results.diploid.nN.seg.ucn.tsv` or `results.tetraploid.nN.seg.ucn.tsv` for the solutions obtained when considering the absence or presence of a WGD, respectively.
-HATCHet selects the best solution under each of the two assumptions through a model selection criterion; as such, it copies the best diploid solutions to the files `chosen.diploid.bbc.ucn` and `chosen.diploid.seg.ucn`, and the best tetraploid solution to the two files `chosen.tetraploid.bbc.ucn` and `chosen.tetraploid.seg.ucn`. Each choice is based on the value of an elbow function (called `score`) computed for each number of clones. This value approximates the second derivative of the factorization objective function (called `OBJ`) and the choice is based on the maximum score; as such **the best number of clones is chosen as the number which significantly improves the objective function with respect to lower number of clones but no significant subsequent improvements are observed with increasing number of clones**. All these values are summarized in the LOG of the HATCHet step by firts summarizing the values for diploid solutions and next those of tetraploid solutions, e.g.
+HATCHet2 selects the best solution under each of the two assumptions through a model selection criterion; as such, it copies the best diploid solutions to the files `chosen.diploid.bbc.ucn` and `chosen.diploid.seg.ucn`, and the best tetraploid solution to the two files `chosen.tetraploid.bbc.ucn` and `chosen.tetraploid.seg.ucn`. Each choice is based on the value of an elbow function (called `score`) computed for each number of clones. This value approximates the second derivative of the factorization objective function (called `OBJ`) and the choice is based on the maximum score; as such **the best number of clones is chosen as the number which significantly improves the objective function with respect to lower numbers of clones but shows no significant subsequent improvements with increasing numbers of clones**. All these values are summarized in the LOG of the HATCHet2 step by first summarizing the values for diploid solutions and next those of tetraploid solutions, e.g.

    ## Scores approximating second derivative for diploid results
    ## Diploid with 2 clones - OBJ: 66.804147 - score: -0.176997273837
@@ -95,8 +95,8 @@ HATCHet selects the best solution under each of the two assumptions through a mo
    ## Tetraploid with 7 clones - OBJ: 8.022994 - score: 0.104757566866
    ## Tetraploid with 8 clones - OBJ: 7.521881 - score: -0.237540399507

-The user should analyze the scores as HATCHet provides the following two parameters (controlling different hypotheses) to investigate alternative solutions with also high scores:
-- **Sensitivity to small CNAs `-l`**: this parameter controls the sensitivity of HATCHet to small CNAs, whose typical values are 0.5-0.6. The user should decrease this value, e.g. 0.2-0.3, to investigate solutions with more clones and smaller CNAs, while it should increase the value, e.g. 1.0-1.5, to give more confidence to large CNAs and less to small CNAs.
+The user should analyze the scores as HATCHet2 provides the following two parameters (controlling different hypotheses) to investigate alternative solutions that also have high scores:
+- **Sensitivity to small CNAs `-l`**: this parameter controls the sensitivity of HATCHet2 to small CNAs, whose typical values are 0.5-0.6. The user should decrease this value, e.g. 0.2-0.3, to investigate solutions with more clones and smaller CNAs, while it should increase the value, e.g. 1.0-1.5, to give more confidence to large CNAs and less to small CNAs.
- **Confidence in a single tumor clone `-g`**: this parameter controls the confidence in the presence of a single tumor clone, whose typical values are 0.2-0.3. The user should increase the value, e.g. 0.4-0.5, to increase the confidence in the presence of a single tumor clone, while lower values, e.g. 0.0-0.1, decrease the confidence and favor the presence of more clones. The value should be increased especially in 2 cases: (1) when the score of 2 clones is particularly high with or without a WGD (e.g. a value close to 0.0), and especially (2) when the score of 2 clones is significantly higher with a WGD than the score of 2 clones without a WGD; the latter may indeed indicate the presence of a single tetraploid clone.

The final best solution, according to prediction of the presence or absence of a WGD, is made based on a trade-off between the number of clones and WGD; more specifically, the diploid solution is chosen when it has the same or lower number of clones than the tetraploid solution, otherwise the tetraploid solution is chosen.
@@ -109,13 +109,13 @@ It is very important that the user verifies the results in different steps to gu
- User can assess the inferred copy numbers by analyzing the inferred maximum values and the inferred clone proportions which define the tumor clonal composition.
- User can assess the joint inference of number of clones and WGD by analyzing the values of the objective function and the related scores.

-There are some typical suspicious and warning cases that the user can identify from the analysis of the LOG of the `HATCHet` step:
+There are some typical suspicious and warning cases that the user can identify from the analysis of the LOG of the `HATCHet2` step:
- Many diploid solutions have high scores.
-- Objective function of tetraploid solutions (i.e. with WGD) does not almost decrease/vary when increasing the number of clones. Even if this can occur because a single tetraploid tumor clone is present, this typically occurrs when the heuristic of HATCHet failed to identify a correct clonal cluster. This case is even more suspicious when the chosen number of diploid clones is much higher or when the objective function of tetraploid solutions is hugely higher that those of diploid solutions.
+- Objective function of tetraploid solutions (i.e. with WGD) barely decreases/varies when increasing the number of clones. Even if this can occur because a single tetraploid tumor clone is present, this typically occurs when the heuristic of HATCHet2 failed to identify a correct clonal cluster. This case is even more suspicious when the chosen number of diploid clones is much higher or when the objective function of tetraploid solutions is hugely higher than those of diploid solutions.
- Inferred clone proportions are identical to the minimum clone proportion `-u` and tumor clones are present in all samples with very small proportions. Also, higher maximum copy numbers are needed when these result in much lower objective functions.
- Huge difference between the number of clones inferred with and without a WGD, especially when the chosen diploid solution has a much lower number of clones than the chosen tetraploid solution.
- Objective function that keeps decreasing significantly and objective function with very small values.
-- Score of 2 clones with a WGD is much higher then score of 2 clones without a WGD. This typically requires to increase the single-clone confidence of HATCHet `-g` to investigate the presence of a single tumor clone.
+- Score of 2 clones with a WGD is much higher than the score of 2 clones without a WGD. This typically requires increasing the single-clone confidence of HATCHet2 `-g` to investigate the presence of a single tumor clone.

The user can consider the following parameters to investigate alternative solutions and better fitting:
- Sensitivity `-l` controls how much the variance of data and clusters influence the choice of the number of clones. The user should **increase the sensitivity (by decreasing the value of `-l` to 0.4, or 0.45, 0.5, or 0.3) when having high-purity or low variance samples** to better investigate multiple solutions, especially when there are multiple solutions with higher scores (and especially when there are many diploid solutions with high scores) and when considering small CNAs. Conversely, the user should **decrease the sensitivity (by increasing -l to 0.6, 0.8, 1.0)** when considering low-purity or high-variance samples.
diff --git a/docs/source/recommendation_runtime.md b/docs/source/recommendation_runtime.md
index ab67cf8b..cb4465ce 100644
--- a/docs/source/recommendation_runtime.md
+++ b/docs/source/recommendation_runtime.md
@@ -4,4 +4,4 @@ This section includes a collection of several tips for improving the overall run

## 1. SNP calling from known database

-HATCHet allows to provide to count-alleles a list of known germline SNPs. This allows to significantly improve the performance. However, running count-alleles without this list (as by default behaviour) results in count-alleles calling germline SNPs along the whole genome and identifying more SNPs (especially including private and rare germline SNPs), which could result in a higher total number of SNPs improving quality of BAF estimations. The user can consider this trade-off.
+HATCHet2 allows the user to provide count-alleles with a list of known germline SNPs, which can significantly improve performance. However, running count-alleles without this list (the default behaviour) results in count-alleles calling germline SNPs along the whole genome and identifying more SNPs (especially private and rare germline SNPs), which could yield a higher total number of SNPs and improve the quality of BAF estimations. The user can consider this trade-off.
diff --git a/examples/demo-WES/demo-wes.sh b/examples/demo-WES/demo-wes.sh
index c505047b..f353e1a4 100644
--- a/examples/demo-WES/demo-wes.sh
+++ b/examples/demo-WES/demo-wes.sh
@@ -1,20 +1,20 @@
# Demo for WES data from a cancer patient
: ex: set ft=markdown ;:<<'```shell' #

-**NOTE**: this demo has not yet been updated for version 1.0 of HATCHet which includes variable-width binning, phasing, and locality-aware clustering.
+**NOTE**: this demo has not yet been updated for version 1.0 of HATCHet2 which includes variable-width binning, phasing, and locality-aware clustering.

-The following HATCHet's demo represents a guided example starting from WES (whole-exome sequencing) data from 2 samples of the same patient. WES data are an interesting case to consider as they are typically characterize by a larger variance, especially for RDR. For simplicity, the demo starts from a BB file `demo-wes.bb` (included in this demo at `examples/demo-WES/`) which contains the RDR and BAF of every genomic bin and, therefore, we assume that the preliminary steps (i.e.
count-reads, count-alleles, and combine-counts) have already been executed by running standard configuration for WES data (bin size of 250kb through -b 250kb of count-reads, and the allele counts for germline heterozygous SNPs have been selected between 30 and 400 through `-c 30 -C 400` of `count-alleles` as the average coverage is 180x).
+The following HATCHet2 demo represents a guided example starting from WES (whole-exome sequencing) data from 2 samples of the same patient. WES data are an interesting case to consider as they are typically characterized by a larger variance, especially for RDR. For simplicity, the demo starts from a BB file `demo-wes.bb` (included in this demo at `examples/demo-WES/`) which contains the RDR and BAF of every genomic bin and, therefore, we assume that the preliminary steps (i.e. count-reads, count-alleles, and combine-counts) have already been executed by running standard configuration for WES data (bin size of 250kb through -b 250kb of count-reads, and the allele counts for germline heterozygous SNPs have been selected between 30 and 400 through `-c 30 -C 400` of `count-alleles` as the average coverage is 180x).

## Requirements and set up

-The demo requires that HATCHet has been successfully compiled and all the dependencies are available and functional. As such, the demo requires the user to properly set up the following paths:
+The demo requires that HATCHet2 has been successfully compiled and all the dependencies are available and functional. As such, the demo requires the user to properly set up the following paths:

```shell
PY="python3" # This is the full path to the version of PYTHON3 which contains the required `hatchet` module. When this corresponds to the standard version, the user can keep the given value of `python3`
:<<'```shell' # Ignore this line
```

-The following paths are consequently obtained to point to the required components of HATCHet
+The following paths are consequently obtained to point to the required components of HATCHet2

```shell
CLUSTERBINS="${PY} -m hatchet cluster-bins"

@@ -35,14 +35,14 @@ PS4='[\t]'

## Global clustering

-The first main step of the demo performs the global clustering of HATCHet where genomic bins which have the same copy-number state in every tumor clone are clustered correspondingly. To do this, we use `cluster-bins`, i.e. the HATCHet's component designed for this purpose. At first, we attempt to run the clustering using the default values of the parameters as follows:
+The first main step of the demo performs the global clustering of HATCHet2 where genomic bins which have the same copy-number state in every tumor clone are clustered correspondingly. To do this, we use `cluster-bins`, i.e. the HATCHet2 component designed for this purpose. At first, we attempt to run the clustering using the default values of the parameters as follows:

```shell
${CLUSTERBINS} demo-wes.bb -o demo-wes.seg -O demo-wes.bbc -e 12 -tB 0.03 -tR 0.15 -d 0.08
:<<'```shell' # Ignore this line
```

-For different type of data it is essential to assess the quality of the clustering because this is performed by a Dirichlet process and it is affected by varying degrees of noise. This assesment is particularly important in the case of WES data where the variance is higher than expected, especially for RDR; in fact we often observe that the clusters are much wider in terms of RDR (x-axis) and tend to have a *disc* shape rather than the expected *oval* shape. To do this, we use `plot-bins`, i.e.
the HATCHet's component designed for the analysis of the data, and produce the cluster plot using the `CBB` command. To help we use the following options:
+For different types of data it is essential to assess the quality of the clustering because this is performed by a Dirichlet process and it is affected by varying degrees of noise. This assessment is particularly important in the case of WES data where the variance is higher than expected, especially for RDR; in fact we often observe that the clusters are much wider in terms of RDR (x-axis) and tend to have a *disc* shape rather than the expected *oval* shape. To do this, we use `plot-bins`, i.e. the HATCHet2 component designed for the analysis of the data, and produce the cluster plot using the `CBB` command. To help we use the following options:
- `--xmin 0` and `--xmax 2` allow to zoom in and to focus the figure on the same RDR (y-axis) range for every sample.
- `-tS 0.005` asks to plot only the clusters which cover at least the `0.5%` of the genome. This is useful to clean the figure and focus on the main components. To trace all steps, we also move the figure to `tR015-cbb.pdf`.

@@ -82,9 +82,9 @@ In this clustering the previously-described condition is met and all the differe

## hatchet's step

-In the last step we apply `hatchet`, i.e. the component of HATCHet which estimates fractional copy numbers, infers allele-and-clone specific copy numbers, and jointly predicts the number of clones (including the normal clone) and the presence of a WGD.
+In the last step we apply `hatchet`, i.e. the component of HATCHet2 which estimates fractional copy numbers, infers allele-and-clone specific copy numbers, and jointly predicts the number of clones (including the normal clone) and the presence of a WGD.
We apply the last step with default parameters and, for simplicity of this demo, we apply only few changes:
-- As the dataset has high variance and noise (see clustering), we consider a minimum clone proportion `-u` slightly higher than the default value, i.e. `6%`. We do this because we cannot infer tumor clones with very low proportions when there is high noise and because potential clones inferred with very low proportions may simply be the result of overfitting.
@@ -82,9 +82,9 @@ In this clustering the previously-described condition is met and all the differe
 
 ## hatchet's step
 
-In the last step we apply `hatchet`, i.e. the component of HATCHet which estimates fractional copy numbers, infers allele-and-clone specific copy numbers, and jointly predicts the number of clones (including the normal clone) and the presence of a WGD.
+In the last step we apply `hatchet`, i.e. the component of HATCHet2 which estimates fractional copy numbers, infers allele-and-clone specific copy numbers, and jointly predicts the number of clones (including the normal clone) and the presence of a WGD.
 We apply the last step with default parameters and, for simplicity of this demo, we apply only few changes:
-- As the dataset has high variance and noise (see clustering), we consider a minimum clone proportion `-u` slightly higher than the default value, i.e. `6%`. We do this because we cannot infer tumor clones with very low proportions when there is high noise and because potential clones inferred with very low proportions may simply be the result of overfitting. In fact, when using values of `-u` smaller than `6%` we obtain solutions with clone proporions identical to the minimum value of `-u`; this is the recommended criterion to determine the need of increasing the value of `-u`. Interestingly, we can observe the same overfitting sign when we consider too high values of the minimum clone proportion, for example `-u 0.1`. This happens because the value is too high to fit the given data. As such, it is always important to choose the minimum value which provides "non-overfitting" results, i.e. results where the clone proportions are not identical to the minimum. When this is not possible, as in very noisy datasets, we reccommend to either tune the clustering or keeping very low values of the minimum clone proportion, as HATCHet is still able to recover the main clonal composition even in the presence of mninor overfitting.
+- As the dataset has high variance and noise (see clustering), we consider a minimum clone proportion `-u` slightly higher than the default value, i.e. `6%`. We do this because we cannot infer tumor clones with very low proportions when there is high noise and because potential clones inferred with very low proportions may simply be the result of overfitting. In fact, when using values of `-u` smaller than `6%` we obtain solutions with clone proportions identical to the minimum value of `-u`; this is the recommended criterion to determine the need of increasing the value of `-u`. Interestingly, we can observe the same sign of overfitting when we consider too high values of the minimum clone proportion, for example `-u 0.1`; this happens because the value is too high to fit the given data. As such, it is always important to choose the minimum value which provides "non-overfitting" results, i.e. results where the clone proportions are not identical to the minimum. When this is not possible, as in very noisy datasets, we recommend either tuning the clustering or keeping very low values of the minimum clone proportion, as HATCHet2 is still able to recover the main clonal composition even in the presence of minor overfitting.
 - We limit the number of clones to 6 for simplicity of this demo and because it is a reasonable value for CNAs when consider only few samples from the same patient.
 - We only consider 100 restarts for the coordinate-descent method; these are the number of attempts to find the best solution. This number is sufficient in this small example but we reccommend to use at least 400 restarts in standard runs.
@@ -113,12 +113,12 @@ We obtain the following summary of results:
 ## The related-tetraploid resulting files are copied to ./chosen.tetraploid.bbc.ucn and ./chosen.tetraploid.seg.ucn
 # The chosen solution is diploid with 4 clones and is written in ./best.bbc.ucn and ./best.seg.ucn
 
-HATCHet predicts the presence of 4 clones in the 3 tumor samples with no WGD and, especially, predicts that each sample contains two distinct tumor clones while sharing one of this. As there are inferred tumor clones with small clone proportions, there are only 2 samples, and the objective function does not significantly decrease after the chosen number of clones, there is no need to investigate the results of HATCHet by increasing the sensitivity with lower values of `-l`. However, the user could investigate the results of HATCHet when considering a lower sensitivity to small CNAs by considering higher values of `-l`, e.g. `-l 0.6` or `-l 0.8`; this choice would be indeed motivated by the high noise of the dataset.
+HATCHet2 predicts the presence of 4 clones in the 3 tumor samples with no WGD and, especially, predicts that each sample contains two distinct tumor clones while sharing one of them. As there are inferred tumor clones with small clone proportions, there are only 2 samples, and the objective function does not significantly decrease after the chosen number of clones, there is no need to investigate the results of HATCHet2 by increasing the sensitivity with lower values of `-l`. However, the user could investigate the results of HATCHet2 when considering a lower sensitivity to small CNAs by considering higher values of `-l`, e.g. `-l 0.6` or `-l 0.8`; this choice would indeed be motivated by the high noise of the dataset.
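The `-u` criterion described above can also be checked programmatically: if some inferred clone proportion sits exactly at the chosen floor, the solution is suspect. The sketch below is an illustration only and assumes `best.bbc.ucn` is tab-separated with the `u_normal`/`u_clone*` proportion columns described in the output format documentation:

```python
# Sketch: flag possible overfitting by testing whether any clone proportion
# is pinned at the -u floor (0.06 here). The u_clone* column names are an
# assumption taken from the documented *.bbc.ucn layout; adjust if needed.
import pandas as pd

MIN_U = 0.06  # the value passed via -u

best = pd.read_table("best.bbc.ucn")
u_cols = [c for c in best.columns if c.startswith("u_clone")]

for sample, rows in best.groupby("SAMPLE"):
    props = rows[u_cols].iloc[0]  # proportions are constant within a sample
    pinned = [c for c in u_cols if abs(props[c] - MIN_U) < 1e-6]
    if pinned:
        print(f"{sample}: {pinned} at the -u floor -> consider raising -u")
    else:
        print(f"{sample}: no clone proportion at the floor")
```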
 
 ## Analyzing inferred results
 
-Finally, we obtain useful plots to summarize and analyze the inferred results by using `plot-cn`, which is the last component of HATCHet. As WES data have fewer point covering the genome, we slightly change the resolution of the plots by asking to obtain genomic regions merging fewer genomic bins through `-rC 10 -rG 1`. As such, we run `plot-cn` as follows
+Finally, we obtain useful plots to summarize and analyze the inferred results by using `plot-cn`, which is the last component of HATCHet2. As WES data have fewer points covering the genome, we slightly change the resolution of the plots by asking to obtain genomic regions merging fewer genomic bins through `-rC 10 -rG 1`. As such, we run `plot-cn` as follows
 
 ```shell
 ${PLOTCN} best.bbc.ucn -rC 10 -rG 1
diff --git a/examples/demo-complete/demo-complete.sh b/examples/demo-complete/demo-complete.sh
index 6b632bb3..f45b5c34 100644
--- a/examples/demo-complete/demo-complete.sh
+++ b/examples/demo-complete/demo-complete.sh
@@ -1,14 +1,14 @@
-# Demo complete for the entire HATCHet pipeline
+# Demo complete for the entire HATCHet2 pipeline
 : ex: set ft=markdown ;:<<'```shell' #
 
-The following HATCHet demo represents a guided example of the complete HATCHet pipeline starting from an exemplary dataset of tumour and matched normal
+The following HATCHet2 demo represents a guided example of the complete HATCHet2 pipeline starting from an exemplary dataset of tumour and matched normal
 [BAM files](https://doi.org/10.5281/zenodo.4046906) publicly available. From this directory, simply run this
 file through BASH as a standard script to run the complete demo. The demo can also be considered as a guided example of a complete execution and is correspondingly commented.
 
 ## Requirements and set up
 
-The demo requires that HATCHet has been succesfully installed in the current python environment.
+The demo requires that HATCHet2 has been successfully installed in the current python environment.
 Please make sure that you can succesfully run the required dependencies `samtools`, `bcftools`, `tabix`, and `mosdepth`.
 The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied.
@@ -68,9 +68,9 @@ samtools dict data/hg19.fa > data/hg19.dict
 :<<'```shell' # Ignore this line
 ```
 
-## Configuring the HATCHet's execution
+## Configuring the HATCHet2 execution
 
-We follow the template of the HATCHet's [script](../../doc/doc_fullpipeline.md#fullpipelineandtutorial).
+We follow the template of the HATCHet2 [script](../../doc/doc_fullpipeline.md#fullpipelineandtutorial).
 
 1. We specify the correct path to the reference genome and the output folder, and other required flags
 ```shell
@@ -137,7 +137,7 @@ echo 'mtr=5000' >> hatchet.ini
 :<<'```shell' # Ignore this line
 ```
 
-## Running HATCHet
+## Running HATCHet2
 
 ```shell
 python -m hatchet run hatchet.ini
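Before launching the pipeline, it can help to double-check which steps the ini-file actually enables. The following is a hypothetical helper (not part of HATCHet2) that assumes the `[run]` section with `True`/`False` step flags shown in the sample `hatchet.ini`:

```python
# Sketch: list which pipeline steps an ini-file enables before calling
# `hatchet run`. Assumes a [run] section with boolean step flags, as in
# the sample hatchet.ini; non-boolean entries (paths, names) are skipped.
import configparser

cfg = configparser.ConfigParser()
cfg.read("hatchet.ini")

for key in cfg["run"]:
    try:
        enabled = cfg["run"].getboolean(key)
    except ValueError:
        continue  # e.g. reference/bams/samples entries are not flags
    print(f"{'ON ' if enabled else 'off'} {key}")
```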
diff --git a/script/README.md b/script/README.md
index 677763b1..7bfe8b5f 100644
--- a/script/README.md
+++ b/script/README.md
@@ -1,6 +1,6 @@
-# Command for running the HATCHet workflow
+# Command for running the HATCHet2 workflow
 
-The entire end-end `HATCHet` pipeline can be run by using the `hatchet run` command. This command requires an *ini-file*
+The entire end-to-end `HATCHet2` pipeline can be run by using the `hatchet run` command. This command requires an *ini-file*
 from which it gets its configuration values. A sample [hatchet.ini](https://raw.githubusercontent.com/raphael-group/hatchet/master/script/hatchet.ini) file is provided in this folder for you to get started. You can name this file anything you want and specify it during `hatchet run`, but we
@@ -19,7 +19,7 @@ bams = "/path/to/tumor1.bam /path/to/tumor2.bam"
 samples = "Primary Met"
 ```
 
-Optionally, if you wish to run the HATCHet pipeline only on select chromosome(s), specify their name(s) under the
+Optionally, if you wish to run the HATCHet2 pipeline only on select chromosome(s), specify their name(s) under the
 'chromosomes' key, separated by whitespace. For example:
 
@@ -27,16 +27,16 @@ chromosomes = chr21 chr22
 ```
 
 This can be very useful when trying to validate your pipeline relatively quickly before running it on all chromosomes.
-As an example, this should be set to `chr22` for [HATCHet Demo data](https://zenodo.org/record/4046906).
+As an example, this should be set to `chr22` for [HATCHet2 Demo data](https://zenodo.org/record/4046906).
 
 To run the pipeline on all chromosomes, leave the key blank.
 
 ```
 chromosomes =
 ```
 
-## Run HATCHet without phasing
+## Run HATCHet2 without phasing
 
-Use the following command To run HATCHet without phasing:
+Use the following command to run HATCHet2 without phasing:
 
 ```
 hatchet run hatchet.ini
@@ -45,12 +45,12 @@
 As explained above, you can leave all values to their defaults, but you will want to override the `reference`, `normal`, `bams` and `samples` values in the ini file.
 
-## Run HATCHet with phasing
+## Run HATCHet2 with phasing
 
-Running HATCHet with phasing is currently a two part process. It's a little more labor intensive but may produce cleaner
+Running HATCHet2 with phasing is currently a two-part process. It's a little more labor intensive but may produce cleaner
 results.
 
-First run `hatchet run hatchet.ini`, but **enable only the first 3 steps** of the HATCHet pipeline in `hatchet.ini`:
+First run `hatchet run hatchet.ini`, but **enable only the first 3 steps** of the HATCHet2 pipeline in `hatchet.ini`:
 
 ```
 genotype_snps = True
@@ -72,7 +72,7 @@ Michigan imputation server:
 1. You may have to use `bcftools annotate` to convert between chromosome names (e.g. chr20 -> 20)
 2. Results are always returned in hg19 coordinates, so you may need to convert coordinates back to hg38 using e.g. Picard's [LiftoverVcf](https://broadinstitute.github.io/picard/command-line-overview.html#LiftoverVcf)
-3. The by-chromosome phased VCF files you receive must be combined with the `bcftools concat` command to give HATCHet a
+3. The by-chromosome phased VCF files you receive must be combined with the `bcftools concat` command to give HATCHet2 a
 single phased VCF file.
 
 Also in `hatchet.ini`, under the `combine_counts` section is a `blocklength` parameter, which is the haplotype block
@@ -81,8 +81,8 @@ haplotype block sizes allow you to combine more SNPs, the accuracy of phasing de
 see this [paper](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1007308) comparing various phasing methods).
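For step 3 of the list above, a thin wrapper can save some manual typing. The sketch below is illustrative only (the per-chromosome file names are hypothetical); it shells out to the real `bcftools concat` and `bcftools index` commands to build the single phased VCF that HATCHet2 expects:

```python
# Sketch: merge per-chromosome phased VCFs into one file for HATCHet2.
# The phased/chr*.phased.vcf.gz naming is a made-up convention; point the
# glob at wherever your imputation results actually live.
import glob
import subprocess

parts = sorted(glob.glob("phased/chr*.phased.vcf.gz"))  # ensure a stable order
assert parts, "no per-chromosome phased VCFs found"

subprocess.run(
    ["bcftools", "concat", "-O", "z", "-o", "phased/combined.vcf.gz", *parts],
    check=True,
)
# Index the combined file so downstream tools can query it by region
subprocess.run(["bcftools", "index", "-t", "phased/combined.vcf.gz"], check=True)
```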
-Then, run the HATCHet workflow again using `hatchet run hatchet.ini`, after enabling only the remaining steps of
-the HATCHet pipeline. This should have a shorter runtime than when you ran the first 3 steps:
+Then, run the HATCHet2 workflow again using `hatchet run hatchet.ini`, after enabling only the remaining steps of
+the HATCHet2 pipeline. This should have a shorter runtime than when you ran the first 3 steps:
 
 ```
 genotype_snps = False
diff --git a/script/hatchet.ini b/script/hatchet.ini
index 1a3dcde3..3461b63a 100644
--- a/script/hatchet.ini
+++ b/script/hatchet.ini
@@ -1,5 +1,5 @@
 [run]
-# What individual steps of HATCHet should we run in the pipeline?
+# What individual steps of HATCHet2 should we run in the pipeline?
 # Valid values are True or False
 download_panel = True
 count_reads = True
@@ -54,7 +54,7 @@ mincov = 8
 # Use 300 for WGS with >30x and Use 1000 for WES with ~100x
 maxcov = 300
 # Path to SNP list
-# If unspecified, HATCHet selects a list of known germline SNPs based on the reference genome version and chromosome notation
+# If unspecified, HATCHet2 selects a list of known germline SNPs based on the reference genome version and chromosome notation
 # If not, please provide full path to a locally stored list (.vcf.gz) here.
 snps =
diff --git a/src/hatchet.egg-info/PKG-INFO b/src/hatchet.egg-info/PKG-INFO
new file mode 100644
index 00000000..e8d56139
--- /dev/null
+++ b/src/hatchet.egg-info/PKG-INFO
@@ -0,0 +1,46 @@
+Metadata-Version: 2.1
+Name: hatchet
+Version: 2.1.0
+Summary: A package to infer allele and clone-specific copy-number aberrations (CNAs).
+Author-email: Simone Zaccaria , Ben Raphael , Matt Myers , Brian Arnold , Vineet Bansal , Metin Balaban
+Project-URL: Homepage, https://github.com/raphael-group/hatchet
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: POSIX :: Linux
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pybedtools
+Requires-Dist: biopython
+Requires-Dist: hmmlearn
+Requires-Dist: matplotlib
+Requires-Dist: pandas
+Requires-Dist: psutil
+Requires-Dist: pyomo
+Requires-Dist: pysam
+Requires-Dist: requests
+Requires-Dist: seaborn
+Requires-Dist: scikit-learn
+Requires-Dist: scipy
+Requires-Dist: statsmodels
+Provides-Extra: dev
+Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: pytest>=6; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: mock; extra == "dev"
+Requires-Dist: coverage; extra == "dev"
+Requires-Dist: numpydoc; extra == "dev"
+Requires-Dist: sphinx; extra == "dev"
+Requires-Dist: sphinxcontrib-bibtex<2.0.0; extra == "dev"
+Requires-Dist: sphinx-rtd-theme; extra == "dev"
+Requires-Dist: recommonmark; extra == "dev"
+Requires-Dist: sphinx-markdown-tables; extra == "dev"
+
+![CI](https://github.com/raphael-group/hatchet/workflows/CI/badge.svg)
+[![codecov](https://codecov.io/gh/raphael-group/hatchet/branch/master/graph/badge.svg)](https://codecov.io/gh/raphael-group/hatchet)
+
+# HATCHet2
+
+HATCHet2 is an algorithm to infer allele and clone-specific copy-number aberrations (CNAs), clone proportions, and whole-genome duplications (WGD) for several tumor clones jointly from multiple bulk-tumor samples of the same patient or from a single bulk-tumor sample.
+ +Complete documentation for HATCHet2 is available at [https://raphael-group.github.io/hatchet/](https://raphael-group.github.io/hatchet/) diff --git a/src/hatchet.egg-info/SOURCES.txt b/src/hatchet.egg-info/SOURCES.txt new file mode 100644 index 00000000..d6102567 --- /dev/null +++ b/src/hatchet.egg-info/SOURCES.txt @@ -0,0 +1,94 @@ +CMakeLists.txt +FindGUROBI.cmake +LICENSE +MANIFEST.in +README.md +pyproject.toml +setup.py +custom/GATK4-CNV/allelecn.png +custom/GATK4-CNV/cbb.png +custom/GATK4-CNV/custom-gatk4-cnv.sh +custom/GATK4-CNV/demo-gatk4-cnv.md +custom/GATK4-CNV/demo-gatk4-cnv.sh +custom/GATK4-CNV/gatk4cnsToBB.py +custom/GATK4-CNV/profiles.png +custom/GATK4-CNV/sample1.GATK4.CNV.seg +custom/GATK4-CNV/sample2.GATK4.CNV.seg +custom/GATK4-CNV/totalcn.png +custom/setup-preprocess/setup.md +custom/setup-preprocess/setup.sh +script/README.md +script/hatchet.ini +src/argparse.cpp +src/argparse.h +src/bbc_instance.cpp +src/bbc_instance.h +src/coordinate_descent.cpp +src/coordinate_descent.h +src/gurobi-utils.h +src/ilp-min.cpp +src/ilp-min.h +src/input_instance.cpp +src/input_instance.h +src/solve.cpp +src/utils.cpp +src/utils.h +src/worker.cpp +src/worker.h +src/hatchet/__init__.py +src/hatchet/__main__.py +src/hatchet/hatchet.ini +src/hatchet.egg-info/PKG-INFO +src/hatchet.egg-info/SOURCES.txt +src/hatchet.egg-info/dependency_links.txt +src/hatchet.egg-info/entry_points.txt +src/hatchet.egg-info/not-zip-safe +src/hatchet.egg-info/requires.txt +src/hatchet.egg-info/top_level.txt +src/hatchet/bin/HATCHet-preprocess.py +src/hatchet/bin/HATCHet.py +src/hatchet/bin/__init__.py +src/hatchet/data/__init__.py +src/hatchet/data/hg19.centromeres.txt +src/hatchet/data/hg38.centromeres.txt +src/hatchet/data/sample.bbc +src/hatchet/data/sample.seg +src/hatchet/data/sample.sorted.bam +src/hatchet/data/sample.sorted.gff.gz +src/hatchet/utils/ArgParsing.py +src/hatchet/utils/BAMBinning.py +src/hatchet/utils/CoordinateFinding.py +src/hatchet/utils/ProgressBar.py +src/hatchet/utils/Supporting.py +src/hatchet/utils/TotalCounting.py +src/hatchet/utils/__init__.py +src/hatchet/utils/check.py +src/hatchet/utils/check_solver.py +src/hatchet/utils/cluster_bins.py +src/hatchet/utils/cluster_bins_gmm.py +src/hatchet/utils/combine_counts.py +src/hatchet/utils/combine_counts_fw.py +src/hatchet/utils/commands.py +src/hatchet/utils/config.py +src/hatchet/utils/count_alleles.py +src/hatchet/utils/count_reads.py +src/hatchet/utils/count_reads_fw.py +src/hatchet/utils/download_panel.py +src/hatchet/utils/genotype_snps.py +src/hatchet/utils/multiprocessing.py +src/hatchet/utils/phase_snps.py +src/hatchet/utils/plot_bins.py +src/hatchet/utils/plot_bins_1d2d.py +src/hatchet/utils/plot_cn.py +src/hatchet/utils/plot_cn_1d2d.py +src/hatchet/utils/rd_gccorrect.py +src/hatchet/utils/run.py +src/hatchet/utils/solve/__init__.py +src/hatchet/utils/solve/cd.py +src/hatchet/utils/solve/ilp_subset.py +src/hatchet/utils/solve/utils.py +tests/test_config.py +tests/test_phase.py +tests/test_solver.py +tests/test_steps.py +tests/test_steps_vw.py diff --git a/src/hatchet.egg-info/dependency_links.txt b/src/hatchet.egg-info/dependency_links.txt new file mode 100644 index 00000000..e69de29b diff --git a/src/hatchet.egg-info/entry_points.txt b/src/hatchet.egg-info/entry_points.txt new file mode 100644 index 00000000..c2827185 --- /dev/null +++ b/src/hatchet.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +hatchet = hatchet.__main__:main diff --git a/src/hatchet.egg-info/not-zip-safe b/src/hatchet.egg-info/not-zip-safe new file 
mode 100644 index 00000000..e69de29b diff --git a/src/hatchet.egg-info/requires.txt b/src/hatchet.egg-info/requires.txt new file mode 100644 index 00000000..acf21e10 --- /dev/null +++ b/src/hatchet.egg-info/requires.txt @@ -0,0 +1,26 @@ +pybedtools +biopython +hmmlearn +matplotlib +pandas +psutil +pyomo +pysam +requests +seaborn +scikit-learn +scipy +statsmodels + +[dev] +pre-commit +pytest>=6 +pytest-cov +mock +coverage +numpydoc +sphinx +sphinxcontrib-bibtex<2.0.0 +sphinx-rtd-theme +recommonmark +sphinx-markdown-tables diff --git a/src/hatchet.egg-info/top_level.txt b/src/hatchet.egg-info/top_level.txt new file mode 100644 index 00000000..915784bb --- /dev/null +++ b/src/hatchet.egg-info/top_level.txt @@ -0,0 +1 @@ +hatchet diff --git a/src/hatchet/bin/HATCHet.py b/src/hatchet/bin/HATCHet.py index 83f7d0e5..b885f433 100644 --- a/src/hatchet/bin/HATCHet.py +++ b/src/hatchet/bin/HATCHet.py @@ -719,7 +719,7 @@ def main(args=None): def parse_clonal_diploid(clonal): """ Given a list of clonal cluster copy numbers, this function tries to order them to be compatible - with the HATCHet C++ factorization module. + with the HATCHet2 C++ factorization module. For diploid scaling, this module requires: -the first cluster is indicated with copy-number 1,1 -the second cluster has a total copy number different from 2 @@ -743,7 +743,7 @@ def parse_clonal_diploid(clonal): raise ValueError( error( "No cluster was indicated as (1,1) in argument to 'clonal' with " - "'diploid'=True. HATCHet solving module requires specification of (1,1) cluster for the " + "'diploid'=True. HATCHet2 solving module requires specification of (1,1) cluster for the " "'clonal' argument to be used in this case." ) ) @@ -783,7 +783,7 @@ def parse_clonal_diploid(clonal): def parse_clonal_tetraploid(clonal): """ Given a list of clonal cluster copy numbers, this function tries to order them to be compatible - with the HATCHet C++ factorization module. + with the HATCHet2 C++ factorization module. For tetraploid scaling, this module requires: -the first cluster is indicated with copy-number 2,2 -the second cluster has a total copy number different from 4 @@ -807,7 +807,7 @@ def parse_clonal_tetraploid(clonal): raise ValueError( error( "No cluster was indicated as (2,2) in argument to 'clonal' with " - "'tetraploid'=True. HATCHet solving module requires specification of (2,2) cluster for the " + "'tetraploid'=True. HATCHet2 solving module requires specification of (2,2) cluster for the " "'clonal' argument to be used in this case." 
) ) diff --git a/src/hatchet/utils/check.py b/src/hatchet/utils/check.py index 384ab404..f3cae24a 100644 --- a/src/hatchet/utils/check.py +++ b/src/hatchet/utils/check.py @@ -261,7 +261,7 @@ def _check_python_import(which): def main(hatchet_cmds=None): all_ok = True hatchet_cmds = hatchet_cmds or all_commands - print("======================\nRunning HATCHet checks\n======================") + print("======================\nRunning HATCHet2 checks\n======================") _pred_cache = {} diff --git a/src/hatchet/utils/combine_counts.py b/src/hatchet/utils/combine_counts.py index 98589bee..f06010d2 100644 --- a/src/hatchet/utils/combine_counts.py +++ b/src/hatchet/utils/combine_counts.py @@ -113,7 +113,7 @@ def main(args=None): big_bb["CORRECTED_READS"] = np.NAN - # For each sample, correct read counts to account for differences in coverage (as in HATCHet) + # For each sample, correct read counts to account for differences in coverage (as in HATCHet2) # (i.e., multiply read counts by total-reads-normal/total-reads-sample) rc = pd.read_table(args["totalcounts"], header=None, names=["SAMPLE", "#READS"]) normal_name = all_names[0] @@ -153,7 +153,7 @@ def main(args=None): # perform GC bias correction autosomal_bb = rd_gccorrect(autosomal_bb, referencefasta) - # Convert intervals from closed to half-open to match .1bed/HATCHet standard format + # Convert intervals from closed to half-open to match .1bed/HATCHet2 standard format autosomal_bb.END = autosomal_bb.END + 1 autosomal_bb.to_csv(outfile, index=False, sep="\t") @@ -168,13 +168,13 @@ def main(args=None): def read_snps(baf_file, ch, all_names, phasefile=None): """ - Read and validate SNP data for this patient (TSV table output from HATCHet deBAF.py). + Read and validate SNP data for this patient (TSV table output from HATCHet2 deBAF.py). """ all_names = all_names[ 1: ] # remove normal sample -- not looking for SNP counts from normal - # Read in HATCHet BAF table + # Read in HATCHet2 BAF table all_snps = pd.read_table( baf_file, names=["CHR", "POS", "SAMPLE", "ALT", "REF"], @@ -1148,7 +1148,7 @@ def run_chromosome( test_alpha, ): """ - Perform adaptive binning and infer BAFs to produce a HATCHet BB file for a single chromosome. + Perform adaptive binning and infer BAFs to produce a HATCHet2 BB file for a single chromosome. """ try: diff --git a/src/hatchet/utils/commands.py b/src/hatchet/utils/commands.py index b6a3b103..347742df 100644 --- a/src/hatchet/utils/commands.py +++ b/src/hatchet/utils/commands.py @@ -1,4 +1,4 @@ -# All supported HATCHet commands +# All supported HATCHet2 commands commands = ( "count-reads", "count-reads-fw", @@ -20,7 +20,7 @@ ) -# Support for old command names as they've been used in earlier versions of HATCHet +# Support for old command names as they've been used in earlier versions of HATCHet2 command_aliases = { "binBAM": "count-reads-fw", "SNPCaller": "genotype-snps", diff --git a/src/hatchet/utils/count_reads.py b/src/hatchet/utils/count_reads.py index 5a116d50..c3890892 100644 --- a/src/hatchet/utils/count_reads.py +++ b/src/hatchet/utils/count_reads.py @@ -302,13 +302,13 @@ def count_chromosome_wrapper(param): def read_snps(baf_file, ch, all_names): """ - Read and validate SNP data for this patient (TSV table output from HATCHet deBAF.py). + Read and validate SNP data for this patient. 
""" all_names = all_names[ 1: ] # remove normal sample -- not looking for SNP counts from normal - # Read in HATCHet BAF table + # Read in HATCHet2 BAF table all_snps = pd.read_table( baf_file, names=["CHR", "POS", "SAMPLE", "REF", "ALT"], diff --git a/src/hatchet/utils/download_panel.py b/src/hatchet/utils/download_panel.py index 24f83eff..f8540ba8 100644 --- a/src/hatchet/utils/download_panel.py +++ b/src/hatchet/utils/download_panel.py @@ -93,7 +93,8 @@ def mod_chain(infile, sample_chr, refpanel_index, sample_index): def dwnld_refpanel_genome(path): newref = os.path.join(path, "hg19_no_chr.fa") if not os.path.isfile(newref): - # If the genome reference file used in other parts of HATCHet matches the one we want, use it + + # If the genome reference file used in other parts of HATCHet2 matches the one we want, use it reference_file = config.paths.reference if ( os.path.isfile(reference_file) diff --git a/src/hatchet/utils/plot_cn_1d2d.py b/src/hatchet/utils/plot_cn_1d2d.py index c0120865..5fc7484e 100644 --- a/src/hatchet/utils/plot_cn_1d2d.py +++ b/src/hatchet/utils/plot_cn_1d2d.py @@ -52,7 +52,7 @@ def generate_1D2D_plots( resample_balanced=False, ): if "#CHR" not in bbc: - # throw HATCHet error + # throw HATCHet2 error raise ValueError("Input table is malformed (missing #CHR column)") # Prepend 'chr' to #CHR column if not already present From d78d2fe9837cf0c22bae6329914f514f92c6f871 Mon Sep 17 00:00:00 2001 From: Matt Myers Date: Mon, 25 Nov 2024 13:54:48 -0500 Subject: [PATCH 2/2] incorporate ruff edits --- docs/source/conf.py | 6 +++--- src/hatchet/utils/download_panel.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 9ebfff0c..5f149f66 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -63,9 +63,9 @@ master_doc = "index" # General information about the project. -project = 'HATCHet2' -copyright = '2024, Princeton University' -author = 'Matthew Myers, Simone Zaccaria, Vineet Bansal, and Brian Arnold' +project = "HATCHet2" +copyright = "2024, Princeton University" +author = "Matthew Myers, Simone Zaccaria, Vineet Bansal, and Brian Arnold" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/src/hatchet/utils/download_panel.py b/src/hatchet/utils/download_panel.py index f8540ba8..edb0f940 100644 --- a/src/hatchet/utils/download_panel.py +++ b/src/hatchet/utils/download_panel.py @@ -93,7 +93,6 @@ def mod_chain(infile, sample_chr, refpanel_index, sample_index): def dwnld_refpanel_genome(path): newref = os.path.join(path, "hg19_no_chr.fa") if not os.path.isfile(newref): - # If the genome reference file used in other parts of HATCHet2 matches the one we want, use it reference_file = config.paths.reference if (