From 49fe9d448b17522434398b1ecf2ddec43069b655 Mon Sep 17 00:00:00 2001 From: ksamuk Date: Wed, 9 Feb 2022 21:47:25 -0800 Subject: [PATCH] fixed sites file numeric chromosome edge case --- README.md | 2 +- conda.recipe/meta.yaml | 2 +- docs/companions.rst | 2 ++ docs/index.rst | 2 +- pixy/__main__.py | 9 ++++++++- pixy/core.py | 7 ++++++- setup.py | 2 +- 7 files changed, 20 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7018758..c1028cb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ``pixy`` ==================== -[![DOI](https://zenodo.org/badge/181987337.svg)](https://zenodo.org/badge/latestdoi/181987337) ![version](https://img.shields.io/badge/version-1.2.5.beta1-blue) [![MIT License](https://img.shields.io/apm/l/atomic-design-ui.svg?)](https://github.com/tterb/atomic-design-ui/blob/master/LICENSEs) +[![DOI](https://zenodo.org/badge/181987337.svg)](https://zenodo.org/badge/latestdoi/181987337) ![version](https://img.shields.io/badge/version-1.2.6.beta1-blue) [![MIT License](https://img.shields.io/apm/l/atomic-design-ui.svg?)](https://github.com/tterb/atomic-design-ui/blob/master/LICENSEs) `pixy` is a command-line tool for painlessly estimating average nucleotide diversity within (π) and between (dxy) populations from a VCF. In particular, pixy facilitates the use of VCFs containing invariant (monomorphic) sites, which are **essential** for the correct computation of π and dxy in the face of missing data (i.e. always). diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index e7f0dd1..4b8a481 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "pixy" %} -{% set version = "1.0.4.beta1" %} +{% set version = "1.2.6.beta1" %} package: name: {{ name|lower }} diff --git a/docs/companions.rst b/docs/companions.rst index a31e1b7..a7b8852 100644 --- a/docs/companions.rst +++ b/docs/companions.rst @@ -33,6 +33,8 @@ BED file The BED file is used to manually defined genome regions over which to calculate summary statistics. It is specified at the command line with the ``--bed_file``. +Note that pixy expects intervals to be one indexed, *not* zero indexed as many typical BED files are. If your BED file is zero indexed, you'll need to convert the intervals (e.g. by adding 1 to the start and end). + *Format* A headerless, tab separated file with the first column containing a chromosome ID, the second column containing the first position of a window, and the third column containing the last potion of the window. Each row = one window. diff --git a/docs/index.rst b/docs/index.rst index 7fef38c..d802645 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,7 +5,7 @@ .. raw:: html -

pixy 1.2.5.beta1

+

pixy 1.2.6.beta1

.. image:: images/pixy_logo.png :width: 200 diff --git a/pixy/__main__.py b/pixy/__main__.py index e87c31a..e46588b 100644 --- a/pixy/__main__.py +++ b/pixy/__main__.py @@ -42,7 +42,7 @@ def main(): help_image = "█▀▀█ ░▀░ █░█ █░░█\n" "█░░█ ▀█▀ ▄▀▄ █▄▄█\n" "█▀▀▀ ▀▀▀ ▀░▀ ▄▄▄█\n" help_text = 'pixy: unbiased estimates of pi, dxy, and fst from VCFs with invariant sites' - version = '1.2.5.beta1' + version = '1.2.6.beta1' citation = 'Korunes, KL and K Samuk. pixy: Unbiased estimation of nucleotide diversity and divergence in the presence of missing data. Mol Ecol Resour. 2021 Jan 16. doi: 10.1111/1755-0998.13326.' # initialize arguments @@ -76,6 +76,10 @@ def main(): optional.add_argument('--silent', action='store_true', help='Suppress all console output.') optional.add_argument('--debug', action='store_true', help=argparse.SUPPRESS) optional.add_argument('--keep_temp_file', action='store_true', help=argparse.SUPPRESS) + + + + #args = parser.parse_args("--debug --bed_file testing/debugging_data/tal/bed_test.bed --sites_file testing/debugging_data/tal/sites_test.txt --stats pi --vcf testing/debugging_data/tal/sample_vcf.gz --populations testing/debugging_data/tal/popfile.txt --output_folder testing/notebook_output".split()) # catch arguments from the command line # automatically uncommented when a release is built @@ -172,6 +176,7 @@ def listener(q, temp_file): interval_end = int(chrom_max[0]) else: sites_df = pandas.read_csv(args.sites_file, sep='\t', usecols=[0,1], names=['CHROM', 'POS']) + sites_df['CHROM'] = sites_df['CHROM'].astype(str) sites_pre_list = sites_df[sites_df['CHROM'] == chromosome] sites_pre_list = sorted(sites_pre_list['POS'].tolist()) interval_start = min(sites_pre_list) @@ -239,8 +244,10 @@ def listener(q, temp_file): # if using a sites file, assign sites to chunks, as with windows above if args.sites_file is not None: sites_df = pandas.read_csv(args.sites_file, sep='\t', usecols=[0,1], names=['CHROM', 'POS']) + sites_df['CHROM'] = sites_df['CHROM'].astype(str) sites_pre_list = sites_df[sites_df['CHROM'] == chromosome] sites_pre_list = sites_pre_list['POS'].tolist() + sites_list = pixy.core.assign_sites_to_chunks(sites_pre_list, args.chunk_size) else: sites_df = None diff --git a/pixy/core.py b/pixy/core.py index ba3a14f..46e4864 100644 --- a/pixy/core.py +++ b/pixy/core.py @@ -193,7 +193,7 @@ def read_and_filter_genotypes(args, chromosome, window_pos_1, window_pos_2, site # if a list of target sites was specified, mask out all non-target sites if sites_list_chunk is not None: gt_array = mask_non_target_sites(gt_array, pos_array, sites_list_chunk) - + # extra 'none' check to catch cases where every site was removed by the mask if len(gt_array) == 0: callset_is_none = True @@ -209,6 +209,7 @@ def compute_summary_stats(args, popnames, popindices, temp_file, chromosome, chu # read in the genotype data for the chunk callset_is_none, gt_array, pos_array = read_and_filter_genotypes(args, chromosome, chunk_pos_1, chunk_pos_2, sites_list_chunk) + # if computing FST, pre-compute a filtered array of variants (only) if 'fst' in args.stats: @@ -275,6 +276,8 @@ def compute_summary_stats(args, popnames, popindices, temp_file, chromosome, chu window_pos_1 = window_list_chunk[window_index][0] window_pos_2 = window_list_chunk[window_index][1] + + if pos_array is None: window_is_empty = True @@ -299,6 +302,7 @@ def compute_summary_stats(args, popnames, popindices, temp_file, chromosome, chu if (len(gt_region) == 0 or (loc_region is None) or (gt_region is None)): window_is_empty = True + # PI: # AVERAGE NUCLEOTIDE DIFFERENCES WITHIN POPULATIONS @@ -702,6 +706,7 @@ def check_and_validate_args(args): bed_df.columns = ['chrom', 'chromStart', 'chromEnd'] bed_chrom = list(bed_df['chrom']) missing = list(set(bed_chrom)-set(chrom_all)) + chrom_all = list(set(chrom_all) & set(bed_chrom)) chrom_list = list(set(chrom_all) & set(bed_chrom)) if len(missing) >0: diff --git a/setup.py b/setup.py index 13a5f49..76b8275 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name='pixy', - version='1.2.5.beta1', + version='1.2.6.beta1', packages=['pixy'], entry_points={ 'console_scripts': [