Skip to content

Commit

Permalink
improved console output
Browse files Browse the repository at this point in the history
  • Loading branch information
ksamuk committed Apr 3, 2020
1 parent 107402e commit 8d385f2
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 55 deletions.
26 changes: 0 additions & 26 deletions .Rhistory

This file was deleted.

81 changes: 53 additions & 28 deletions pixy/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def main(args=None):
# initialize all the aruments
parser = argparse.ArgumentParser(description=help_image+help_text, formatter_class=argparse.RawTextHelpFormatter)

parser.add_argument('--version', action='version', version='%(prog)s version 0.93.3')
parser.add_argument('--version', action='version', version='%(prog)s version 0.93.4')
parser.add_argument('--stats', nargs='+', choices=['pi', 'dxy', 'fst'], help='Which statistics to calculate from the VCF (pi, dxy, and/or fst, separated by spaces)', required=True)
parser.add_argument('--vcf', type=str, nargs='?', help='Path to the input VCF', required=True)
parser.add_argument('--zarr_path', type=str, nargs='?', help='Folder in which to build the Zarr array(s)', required=True)
Expand All @@ -51,14 +51,27 @@ def main(args=None):
parser.add_argument('--fst_maf_filter', default=0.0, type=float, nargs='?', help='Minor allele frequency filter for FST calculations, with value 0.0-1.0. Sites with MAF less than this value will be excluded.')

# catch arguments from the command line

args = parser.parse_args()




# CHECK FOR TABIX
# (disabled until we implement site level and BED support)
#tabix_path = shutil.which("tabix")

#if tabix_path is None:
# warnings.warn('[pixy] WARNING: tabix is not installed (or cannot be located) -- this may reduce performance. install tabix with "conda install -c bioconda tabix"')
#if not os.path.exists(args.vcf + ".tbi") and tabix_path is not None:
# raise Exception('[pixy] ERROR: your vcf is not indexed with tabix, index the bgzipped vcf with "tabix your.vcf.gz"')




# VALIDATE ARGUMENTS

print("[pixy] Validating inputs...")

# get vcf header info
vcf_headers = allel.read_vcf_headers(args.vcf)

Expand All @@ -76,10 +89,10 @@ def main(args=None):
filter_fields.append(re.sub("[^A-Za-z]+", "", x))

missing = list(set(filter_fields)-set(format_fields))

if len(missing) >0:
raise Exception('ERROR: the following genotype filters were requested but not occur in the VCF:', missing)
raise Exception('[pixy] ERROR: the following genotype filters were requested but not occur in the VCF:', missing)

# CHROMOSOMES

# check if the vcf is zipped
Expand All @@ -99,19 +112,19 @@ def main(args=None):
chrom_all = subprocess.check_output(cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq", shell=True).decode("utf-8").split()
missing = list(set(chrom_list)-set(chrom_all))
if len(missing) >0:
raise Exception('ERROR: the following chromosomes were requested but not occur in the VCF:', missing)
raise Exception('[pixy] ERROR: the following chromosomes were requested but not occur in the VCF:', missing)

# INTERVALS
# check if intervals are correctly specified

if args.interval_start is not None and args.interval_end is None:
raise Exception('ERROR: Both --interval_start and --interval_end must be specified')
raise Exception('[pixy] ERROR: Both --interval_start and --interval_end must be specified')

if args.interval_start is None and args.interval_end is not None:
raise Exception('ERROR: Both --interval_start and --interval_end must be specified')
raise Exception('[pixy] ERROR: Both --interval_start and --interval_end must be specified')

if args.interval_start is not None and args.interval_end is not None and len(chrom_list) > 1:
raise Exception('ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.')
raise Exception('[pixy] ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.')

# SAMPLES
# check if requested samples exist in vcf
Expand All @@ -136,7 +149,7 @@ def main(args=None):
try:
samples_callset_index = [samples_list.index(s) for s in poppanel['ID']]
except ValueError as e:
raise Exception('ERROR: the following samples are listed in the population file but not in the VCF:', missing) from e
raise Exception('[pixy] ERROR: the following samples are listed in the population file but not in the VCF:', missing) from e
else:
poppanel['callset_index'] = samples_callset_index

Expand All @@ -146,7 +159,7 @@ def main(args=None):
for name in popnames:
popindices[name] = poppanel[poppanel.Population == name].callset_index.values

print ("Preparing for calculation of summary statistics for " + str(len(chrom_list)) +" chromosome(s), and " + str(len(IDs)) + " sample(s)...")
print ("[pixy] Preparing for calculation of summary statistics for " + str(len(chrom_list)) +" chromosome(s), and " + str(len(IDs)) + " sample(s)...")



Expand Down Expand Up @@ -197,27 +210,40 @@ def main(args=None):

# time the calculations
start_time = time.time()
print("Started calculations at " + time.strftime("%H:%M:%S", time.localtime(start_time)) + " local time")
print("[pixy] Started calculations at " + time.strftime("%H:%M:%S", time.localtime(start_time)) + " local time")

for chromosome in chrom_list:

# Zarr array conversion

# the chromosome specific zarr path
zarr_path = args.zarr_path + "/" + chromosome


# determine the fields that will be included
# TBD: just reading all fields currently
# vcf_fields = ['variants/CHROM', 'variants/POS'] + ['calldata/' + s for s in np.unique(filter_fields)]

# build region string (if using an interval)
if args.interval_start is not None:
targ_region = chromosome + ":" + str(args.interval_start) + "-" + str(args.interval_end)
else:
targ_region = chromosome

# allow for resuse of previously calculated zarr arrays
if args.reuse_zarr == 'yes' and os.path.exists(zarr_path):
print("If a zarr array exists, it will be reused for chromosome " + chromosome + "...")
print("[pixy] If a zarr array exists, it will be reused for chromosome " + chromosome + "...")
elif args.reuse_zarr == 'no' or os.path.exists(zarr_path) is not True:
if os.path.exists(zarr_path):
shutil.rmtree(zarr_path)
os.mkdir(zarr_path)
allel.vcf_to_zarr(args.vcf, zarr_path, region=chromosome, fields='*', overwrite=True)
print("[pixy] Building zarr array for chromosome " + chromosome + "...")
warnings.filterwarnings("ignore")
allel.vcf_to_zarr(args.vcf, zarr_path, region=targ_region, fields='*', overwrite=True)
warnings.resetwarnings()

print("Calculating statistics for chromosome " + chromosome + "...")
print("[pixy] Calculating statistics for chromosome " + targ_region + "...")

# inspect the structure of the zarr data
# open the zarr
callset = zarr.open_group(zarr_path, mode='r')

# parse the filtration expression and build the boolean filter array
Expand Down Expand Up @@ -248,7 +274,7 @@ def main(args=None):
try:
stat_index = calldata_fields.index(stat)
except ValueError as e:
raise Exception("Error: The requested filter \'" + stat + "\' is not annotated in the input VCF") from e
raise Exception("[pixy] ERROR: The requested filter \'" + stat + "\' is not annotated in the input VCF") from e
else:

# check if this is the first run through the loop
Expand Down Expand Up @@ -411,22 +437,22 @@ def dxyCompareGTs(vec1, vec2): #for dxy
if (interval_start > interval_end):
raise ValueError()
except ValueError as e:
raise Exception("The specified interval start ("+str(interval_start)+") exceeds the interval end ("+str(interval_end)+")") from e
raise Exception("[pixy] ERROR: The specified interval start ("+str(interval_start)+") exceeds the interval end ("+str(interval_end)+")") from e

# catch misspecified intervals
if (interval_end > max(pos_array)):
warnings.warn("The specified interval end ("+str(interval_end)+") exceeds the last position of the chromosome and has been substituted ("+str(max(pos_array))+")")
warnings.warn("[pixy] WARNING: The specified interval end ("+str(interval_end)+") exceeds the last position of the chromosome and has been substituted ("+str(max(pos_array))+")")
interval_end = max(pos_array)

if (interval_start < min(pos_array)):
warnings.warn("The specified interval start ("+str(interval_start)+") begins before the first position of the chromosome and has been substituted ("+str(min(pos_array))+")")
warnings.warn("[pixy] WARNING: The specified interval start ("+str(interval_start)+") begins before the first position of the chromosome and has been substituted ("+str(min(pos_array))+")")
interval_start = min(pos_array)

try:
if ((interval_end - interval_start + 1) < window_size):
raise ValueError()
except ValueError as e:
raise Exception("The specified interval ("+str(interval_start)+"-"+str(interval_end)+") is too small for the requested window size ("+str(window_size)+")") from e
raise Exception("[pixy] ERROR: The specified interval ("+str(interval_start)+"-"+str(interval_end)+") is too small for the requested window size ("+str(window_size)+")") from e

# PI:
# AVERAGE NUCLEOTIDE VARIATION WITHIN POPULATIONS
Expand Down Expand Up @@ -476,13 +502,12 @@ def dxyCompareGTs(vec1, vec2): #for dxy
# close output file and print complete message
outfile.close()

print("Pi calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_pi.txt")
print("[pixy] Pi calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_pi.txt")


# DXY:
# AVERAGE NUCLEOTIDE VARIATION BETWEEN POPULATIONS


if (args.populations is not None) and ('dxy' in args.stats):

# create a list of all pairwise comparisons between populations in the popfile
Expand Down Expand Up @@ -530,7 +555,7 @@ def dxyCompareGTs(vec1, vec2): #for dxy
window_pos_2 = interval_end

outfile.close()
print("Dxy calculations chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_dxy.txt")
print("[pixy] Dxy calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_dxy.txt")

# FST:
# WEIR AND COCKERHAMS FST
Expand Down Expand Up @@ -581,14 +606,14 @@ def dxyCompareGTs(vec1, vec2): #for dxy
for fst,wind,snps in zip(a, b, c):
outfile.write(str(pop_pair[0]) + "\t" + str(pop_pair[1]) + "\t" + str(chromosome) + "\t" + str(wind[0]) + "\t" + str(wind[1]) + "\t" + str(fst) + "\t" + str(snps) +"\n")
outfile.close()
print("Fst calculations chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_fst.txt")
print("[pixy] Fst calculations for chromosome " + chromosome + " complete and written to " + args.outfile_prefix + "_fst.txt")




print("\nAll calculations complete!")
print("\n[pixy] All calculations complete!")
end_time = (time.time() - start_time)
print("Time elapsed: " + time.strftime("%H:%M:%S", time.gmtime(end_time)))
print("[pixy] Time elapsed: " + time.strftime("%H:%M:%S", time.gmtime(end_time)))



Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

setup(
name='pixy',
version='0.93.3',
version='0.93.4',
packages=['pixy'],
entry_points={
'console_scripts': [
Expand Down

0 comments on commit 8d385f2

Please sign in to comment.