fixed sites file numeric chromosome edge case

ksamuk · Feb 10, 2022 · 49fe9d4 · 49fe9d4
1 parent ec4ec03
commit 49fe9d4
Show file tree

Hide file tree

Showing 7 changed files with 20 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 ``pixy``<img src="https://raw.githubusercontent.com/ksamuk/pixy/master/docs/images/pixy_logo.png" align="right" width="20%">
 ====================
 
-[![DOI](https://zenodo.org/badge/181987337.svg)](https://zenodo.org/badge/latestdoi/181987337) ![version](https://img.shields.io/badge/version-1.2.5.beta1-blue) [![MIT License](https://img.shields.io/apm/l/atomic-design-ui.svg?)](https://github.com/tterb/atomic-design-ui/blob/master/LICENSEs) 
+[![DOI](https://zenodo.org/badge/181987337.svg)](https://zenodo.org/badge/latestdoi/181987337) ![version](https://img.shields.io/badge/version-1.2.6.beta1-blue) [![MIT License](https://img.shields.io/apm/l/atomic-design-ui.svg?)](https://github.com/tterb/atomic-design-ui/blob/master/LICENSEs) 
 
 `pixy` is a command-line tool for painlessly estimating average nucleotide diversity within (π) and between (d<sub>xy</sub>) populations from a VCF. In particular, pixy facilitates the use of VCFs containing invariant (monomorphic) sites, which are **essential** for the correct computation of π and d<sub>xy</sub> in the face of missing data (i.e. always).
 

diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "pixy" %}
-{% set version = "1.0.4.beta1" %}
+{% set version = "1.2.6.beta1" %}
 
 package:
   name: {{ name|lower }}

diff --git a/docs/companions.rst b/docs/companions.rst
@@ -33,6 +33,8 @@ BED file
 
 The BED file is used to manually defined genome regions over which to calculate summary statistics. It is specified at the command line with the ``--bed_file``.
 
+Note that pixy expects intervals to be one indexed, *not* zero indexed as many typical BED files are. If your BED file is zero indexed, you'll need to convert the intervals (e.g. by adding 1 to the start and end).
+
 *Format*
 
 A headerless, tab separated file with the first column containing a chromosome ID, the second column containing the first position of a window, and the third column containing the last potion of the window. Each row = one window.

diff --git a/docs/index.rst b/docs/index.rst
@@ -5,7 +5,7 @@
 
 .. raw:: html
 
-    <div align="center"><h1> pixy 1.2.5.beta1 </h1></div>
+    <div align="center"><h1> pixy 1.2.6.beta1 </h1></div>
 
 .. image:: images/pixy_logo.png
    :width: 200

diff --git a/pixy/__main__.py b/pixy/__main__.py
@@ -42,7 +42,7 @@ def main():
     help_image = "█▀▀█ ░▀░ █░█ █░░█\n"     "█░░█ ▀█▀ ▄▀▄ █▄▄█\n"     "█▀▀▀ ▀▀▀ ▀░▀ ▄▄▄█\n"
 
     help_text = 'pixy: unbiased estimates of pi, dxy, and fst from VCFs with invariant sites'
-    version = '1.2.5.beta1'
+    version = '1.2.6.beta1'
     citation = 'Korunes, KL and K Samuk. pixy: Unbiased estimation of nucleotide diversity and divergence in the presence of missing data. Mol Ecol Resour. 2021 Jan 16. doi: 10.1111/1755-0998.13326.'
 
     # initialize arguments
@@ -76,6 +76,10 @@ def main():
     optional.add_argument('--silent', action='store_true', help='Suppress all console output.')
     optional.add_argument('--debug', action='store_true', help=argparse.SUPPRESS)
     optional.add_argument('--keep_temp_file', action='store_true', help=argparse.SUPPRESS)
+
+
+
+    #args = parser.parse_args("--debug --bed_file testing/debugging_data/tal/bed_test.bed --sites_file testing/debugging_data/tal/sites_test.txt  --stats pi --vcf testing/debugging_data/tal/sample_vcf.gz --populations testing/debugging_data/tal/popfile.txt --output_folder testing/notebook_output".split())
 
     # catch arguments from the command line
     # automatically uncommented when a release is built
@@ -172,6 +176,7 @@ def listener(q, temp_file):
                     interval_end = int(chrom_max[0])
                 else:
                     sites_df = pandas.read_csv(args.sites_file, sep='\t', usecols=[0,1], names=['CHROM', 'POS'])
+                    sites_df['CHROM'] = sites_df['CHROM'].astype(str)
                     sites_pre_list = sites_df[sites_df['CHROM'] == chromosome]
                     sites_pre_list = sorted(sites_pre_list['POS'].tolist())
                     interval_start = min(sites_pre_list)
@@ -239,8 +244,10 @@ def listener(q, temp_file):
         # if using a sites file, assign sites to chunks, as with windows above
         if args.sites_file is not None:
             sites_df = pandas.read_csv(args.sites_file, sep='\t', usecols=[0,1], names=['CHROM', 'POS'])
+            sites_df['CHROM'] = sites_df['CHROM'].astype(str)
             sites_pre_list = sites_df[sites_df['CHROM'] == chromosome]
             sites_pre_list = sites_pre_list['POS'].tolist()
+
             sites_list = pixy.core.assign_sites_to_chunks(sites_pre_list, args.chunk_size)
         else:
             sites_df = None

diff --git a/pixy/core.py b/pixy/core.py
@@ -193,7 +193,7 @@ def read_and_filter_genotypes(args, chromosome, window_pos_1, window_pos_2, site
         # if a list of target sites was specified, mask out all non-target sites
         if sites_list_chunk is not None:
             gt_array = mask_non_target_sites(gt_array, pos_array, sites_list_chunk)
-        
+
         # extra 'none' check to catch cases where every site was removed by the mask
         if len(gt_array) == 0:
             callset_is_none = True
@@ -209,6 +209,7 @@ def compute_summary_stats(args, popnames, popindices, temp_file, chromosome, chu
     # read in the genotype data for the chunk
     callset_is_none, gt_array, pos_array = read_and_filter_genotypes(args, chromosome, chunk_pos_1, chunk_pos_2, sites_list_chunk)
 
+
     # if computing FST, pre-compute a filtered array of variants (only)
     if 'fst' in args.stats:
 
@@ -275,6 +276,8 @@ def compute_summary_stats(args, popnames, popindices, temp_file, chromosome, chu
 
         window_pos_1 = window_list_chunk[window_index][0]
         window_pos_2 = window_list_chunk[window_index][1]
+
+
 
         if pos_array is None:
             window_is_empty = True
@@ -299,6 +302,7 @@ def compute_summary_stats(args, popnames, popindices, temp_file, chromosome, chu
 
             if (len(gt_region) == 0 or (loc_region is None) or (gt_region is None)):
                 window_is_empty = True
+
 
         # PI:
         # AVERAGE NUCLEOTIDE DIFFERENCES WITHIN POPULATIONS
@@ -702,6 +706,7 @@ def check_and_validate_args(args):
             bed_df.columns = ['chrom', 'chromStart', 'chromEnd']
             bed_chrom = list(bed_df['chrom'])
             missing = list(set(bed_chrom)-set(chrom_all))
+            chrom_all = list(set(chrom_all) & set(bed_chrom))
             chrom_list = list(set(chrom_all) & set(bed_chrom))
 
         if len(missing) >0:

diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@
 
 setup(
     name='pixy',
-    version='1.2.5.beta1',
+    version='1.2.6.beta1',
     packages=['pixy'],
     entry_points={
         'console_scripts': [