From 8d385f286b6da169db3f5d645e09c571ef3b1cca Mon Sep 17 00:00:00 2001
From: ksamuk <ksamuk@gmail.com>
Date: Fri, 3 Apr 2020 08:03:12 -0400
Subject: [PATCH] improved console output

---
 .Rhistory        | 26 ----------------
 pixy/__main__.py | 81 +++++++++++++++++++++++++++++++-----------------
 setup.py         |  2 +-
 3 files changed, 54 insertions(+), 55 deletions(-)
 delete mode 100644 .Rhistory

diff --git a/.Rhistory b/.Rhistory
deleted file mode 100644
index b4844e0..0000000
--- a/.Rhistory
+++ /dev/null
@@ -1,26 +0,0 @@
-library('tidyverse')
-meta_df<- read.table("data/samples.meta.txt")
-meta_df<- read.table("data/samples.meta.txt", h = T)
-meta_df<- read_delim("data/samples.meta.txt", h = T)
-meta_df<- read_delim("data/samples.meta.txt")
-meta_df<- read.table("data/samples.meta.txt", h = T)
-meta_df<- read.table("data/samples.meta.txt", h = T, sep = "\t")
-meta_df<- read.table("data/samples.meta.txt", h = F, sep = "\t")
-meta_df<- read_delim("data/samples.meta.txt", h = F, sep = "\t")
-meta_df<- read_delim("data/samples.meta.txt")
-meta_df<- read_delim("data/samples.meta.txt", delim = "\t")
-head(meta_df<)
-head(meta_df)
-samp_df <- read_delim("data/An1000g_Samples.csv")
-samp_df <- read_delim("data/An1000g_Samples.csv", delim = ",")
-samp_df <- read_delim("data/An1000g_Samples - Sheet3.csv", delim = ",")
-head(samp_df)
-samp_df <- read_delim("data/An1000g_Samples - Sheet3.csv", delim = ",") %>%
-select(Population, Sample)
-head(samp_df)
-meta_df<- read_delim("data/samples.meta.txt", delim = "\t") %>%
-rename(Sample = ebi_sample_acc)
-samp_df %>% left_join(meta_df)
-samp_df %>% left_join(meta_df) %>% View
-samp_df %>% left_join(meta_df) %>%
-write.csv("data/meta_join.csv")
diff --git a/pixy/__main__.py b/pixy/__main__.py
index badff33..cbab9d4 100644
--- a/pixy/__main__.py
+++ b/pixy/__main__.py
@@ -34,7 +34,7 @@ def main(args=None):
     # initialize all the aruments
     parser = argparse.ArgumentParser(description=help_image+help_text, formatter_class=argparse.RawTextHelpFormatter)
     
-    parser.add_argument('--version', action='version', version='%(prog)s version 0.93.3')
+    parser.add_argument('--version', action='version', version='%(prog)s version 0.93.4')
     parser.add_argument('--stats', nargs='+', choices=['pi', 'dxy', 'fst'], help='Which statistics to calculate from the VCF (pi, dxy, and/or fst, separated by spaces)', required=True)
     parser.add_argument('--vcf', type=str, nargs='?', help='Path to the input VCF', required=True)
     parser.add_argument('--zarr_path', type=str, nargs='?', help='Folder in which to build the Zarr array(s)', required=True)
@@ -51,14 +51,27 @@ def main(args=None):
     parser.add_argument('--fst_maf_filter', default=0.0, type=float, nargs='?', help='Minor allele frequency filter for FST calculations, with value 0.0-1.0. Sites with MAF less than this value will be excluded.')
     
     # catch arguments from the command line
-    
     args = parser.parse_args()
     
     
     
     
+    # CHECK FOR TABIX
+    # (disabled until we implement site level and BED support)
+    #tabix_path = shutil.which("tabix")
+    
+    #if tabix_path is None:
+    #    warnings.warn('[pixy] WARNING: tabix is not installed (or cannot be located) -- this may reduce performance. install tabix with "conda install -c bioconda tabix"') 
+    #if not os.path.exists(args.vcf + ".tbi") and tabix_path is not None:
+    #    raise Exception('[pixy] ERROR: your vcf is not indexed with tabix, index the bgzipped vcf with "tabix your.vcf.gz"') 
+    
+    
+    
+    
     # VALIDATE ARGUMENTS
     
+    print("[pixy] Validating inputs...")
+    
     # get vcf header info
     vcf_headers = allel.read_vcf_headers(args.vcf)
     
@@ -76,10 +89,10 @@ def main(args=None):
         filter_fields.append(re.sub("[^A-Za-z]+", "", x))
         
     missing = list(set(filter_fields)-set(format_fields))
-        
+    
     if len(missing) >0:
-        raise Exception('ERROR: the following genotype filters were requested but not occur in the VCF:', missing) 
-        
+        raise Exception('[pixy] ERROR: the following genotype filters were requested but not occur in the VCF:', missing) 
+    
     # CHROMOSOMES
     
     # check if the vcf is zipped
@@ -99,19 +112,19 @@ def main(args=None):
         chrom_all = subprocess.check_output(cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq", shell=True).decode("utf-8").split()
         missing = list(set(chrom_list)-set(chrom_all))
         if len(missing) >0:
-            raise Exception('ERROR: the following chromosomes were requested but not occur in the VCF:', missing) 
+            raise Exception('[pixy] ERROR: the following chromosomes were requested but not occur in the VCF:', missing) 
     
     # INTERVALS
     # check if intervals are correctly specified
     
     if args.interval_start is not None and args.interval_end is None:
-        raise Exception('ERROR: Both --interval_start and --interval_end must be specified') 
+        raise Exception('[pixy] ERROR: Both --interval_start and --interval_end must be specified') 
         
     if args.interval_start is None and args.interval_end is not None:
-        raise Exception('ERROR: Both --interval_start and --interval_end must be specified') 
+        raise Exception('[pixy] ERROR: Both --interval_start and --interval_end must be specified') 
         
     if args.interval_start is not None and args.interval_end is not None and len(chrom_list) > 1:
-        raise Exception('ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.') 
+        raise Exception('[pixy] ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.') 
     
     # SAMPLES
     # check if requested samples exist in vcf
@@ -136,7 +149,7 @@ def main(args=None):
     try:
         samples_callset_index = [samples_list.index(s) for s in poppanel['ID']]
     except ValueError as e:
-        raise Exception('ERROR: the following samples are listed in the population file but not in the VCF:', missing) from e
+        raise Exception('[pixy] ERROR: the following samples are listed in the population file but not in the VCF:', missing) from e
     else:   
         poppanel['callset_index'] = samples_callset_index
     
@@ -146,7 +159,7 @@ def main(args=None):
         for name in popnames:
             popindices[name] = poppanel[poppanel.Population == name].callset_index.values
     
-    print ("Preparing for calculation of summary statistics for " + str(len(chrom_list)) +" chromosome(s), and " + str(len(IDs)) + " sample(s)...")
+    print ("[pixy] Preparing for calculation of summary statistics for " + str(len(chrom_list)) +" chromosome(s), and " + str(len(IDs)) + " sample(s)...")
     
     
     
@@ -197,7 +210,7 @@ def main(args=None):
     
     # time the calculations
     start_time = time.time()
-    print("Started calculations at " + time.strftime("%H:%M:%S", time.localtime(start_time)) + " local time")
+    print("[pixy] Started calculations at " + time.strftime("%H:%M:%S", time.localtime(start_time)) + " local time")
     
     for chromosome in chrom_list:
          
@@ -205,19 +218,32 @@ def main(args=None):
         
         # the chromosome specific zarr path
         zarr_path = args.zarr_path + "/" + chromosome
-           
+        
+        # determine the fields that will be included
+        # TBD: just reading all fields currently
+        # vcf_fields = ['variants/CHROM', 'variants/POS'] + ['calldata/' + s for s in np.unique(filter_fields)]
+        
+        # build region string (if using an interval)
+        if args.interval_start is not None:
+            targ_region = chromosome + ":" + str(args.interval_start) + "-" + str(args.interval_end)
+        else:
+            targ_region = chromosome
+        
         # allow for resuse of previously calculated zarr arrays
         if args.reuse_zarr == 'yes' and os.path.exists(zarr_path):
-            print("If a zarr array exists, it will be reused for chromosome " + chromosome + "...")
+            print("[pixy] If a zarr array exists, it will be reused for chromosome " + chromosome + "...")
         elif args.reuse_zarr == 'no' or os.path.exists(zarr_path) is not True:
             if os.path.exists(zarr_path):
                 shutil.rmtree(zarr_path)
             os.mkdir(zarr_path)
-            allel.vcf_to_zarr(args.vcf, zarr_path, region=chromosome, fields='*', overwrite=True)
+            print("[pixy] Building zarr array for chromosome " + chromosome + "...")
+            warnings.filterwarnings("ignore")
+            allel.vcf_to_zarr(args.vcf, zarr_path, region=targ_region, fields='*', overwrite=True)
+            warnings.resetwarnings()
             
-        print("Calculating statistics for chromosome " + chromosome + "...")
+        print("[pixy] Calculating statistics for chromosome " + targ_region + "...")
         
-        # inspect the structure of the zarr data
+        # open the zarr
         callset = zarr.open_group(zarr_path, mode='r')
     
         # parse the filtration expression and build the boolean filter array
@@ -248,7 +274,7 @@ def main(args=None):
                 try: 
                     stat_index = calldata_fields.index(stat)
                 except ValueError as e:
-                    raise Exception("Error: The requested filter \'" + stat + "\' is not annotated in the input VCF") from e
+                    raise Exception("[pixy] ERROR: The requested filter \'" + stat + "\' is not annotated in the input VCF") from e
                 else: 
     
                     # check if this is the first run through the loop
@@ -411,22 +437,22 @@ def dxyCompareGTs(vec1, vec2): #for dxy
             if (interval_start > interval_end):
                 raise ValueError()        
         except ValueError as e:
-            raise Exception("The specified interval start ("+str(interval_start)+") exceeds the interval end ("+str(interval_end)+")") from e
+            raise Exception("[pixy] ERROR: The specified interval start ("+str(interval_start)+") exceeds the interval end ("+str(interval_end)+")") from e
     
         # catch misspecified intervals
         if (interval_end > max(pos_array)):
-            warnings.warn("The specified interval end ("+str(interval_end)+") exceeds the last position of the chromosome and has been substituted ("+str(max(pos_array))+")")
+            warnings.warn("[pixy] WARNING: The specified interval end ("+str(interval_end)+") exceeds the last position of the chromosome and has been substituted ("+str(max(pos_array))+")")
             interval_end = max(pos_array)
             
         if (interval_start < min(pos_array)):
-            warnings.warn("The specified interval start ("+str(interval_start)+") begins before the first position of the chromosome and has been substituted ("+str(min(pos_array))+")")
+            warnings.warn("[pixy] WARNING: The specified interval start ("+str(interval_start)+") begins before the first position of the chromosome and has been substituted ("+str(min(pos_array))+")")
             interval_start = min(pos_array)
     
         try:           
             if ((interval_end - interval_start + 1) < window_size):
                 raise ValueError()      
         except ValueError as e:
-            raise Exception("The specified interval ("+str(interval_start)+"-"+str(interval_end)+") is too small for the requested window size ("+str(window_size)+")") from e
+            raise Exception("[pixy] ERROR: The specified interval ("+str(interval_start)+"-"+str(interval_end)+") is too small for the requested window size ("+str(window_size)+")") from e
     
         # PI:
         # AVERAGE NUCLEOTIDE VARIATION WITHIN POPULATIONS
@@ -476,13 +502,12 @@ def dxyCompareGTs(vec1, vec2): #for dxy
                 # close output file and print complete message
             outfile.close()
     
-            print("Pi calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_pi.txt")
+            print("[pixy] Pi calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_pi.txt")
     
     
         # DXY:
         # AVERAGE NUCLEOTIDE VARIATION BETWEEN POPULATIONS
     
-    
         if (args.populations is not None) and ('dxy' in args.stats):
     
             # create a list of all pairwise comparisons between populations in the popfile
@@ -530,7 +555,7 @@ def dxyCompareGTs(vec1, vec2): #for dxy
                         window_pos_2 = interval_end
     
             outfile.close()
-            print("Dxy calculations chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_dxy.txt")
+            print("[pixy] Dxy calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_dxy.txt")
     
         # FST:
         # WEIR AND COCKERHAMS FST
@@ -581,14 +606,14 @@ def dxyCompareGTs(vec1, vec2): #for dxy
                 for fst,wind,snps in zip(a, b, c):
                     outfile.write(str(pop_pair[0]) + "\t" + str(pop_pair[1]) + "\t" + str(chromosome) + "\t" + str(wind[0]) + "\t" + str(wind[1]) + "\t" + str(fst) + "\t" + str(snps) +"\n")
             outfile.close()
-            print("Fst calculations chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_fst.txt")
+            print("[pixy] Fst calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_fst.txt")
     
     
     
     
-    print("\nAll calculations complete!")
+    print("\n[pixy] All calculations complete!")
     end_time = (time.time() - start_time)
-    print("Time elapsed: " + time.strftime("%H:%M:%S", time.gmtime(end_time)))
+    print("[pixy] Time elapsed: " + time.strftime("%H:%M:%S", time.gmtime(end_time)))
     
     
     
diff --git a/setup.py b/setup.py
index dd9d3af..60df9be 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name='pixy',
-    version='0.93.3',
+    version='0.93.4',
     packages=['pixy'],
     entry_points={
         'console_scripts': [