-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdownsampling.py
192 lines (170 loc) · 8.09 KB
/
downsampling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/env python3
import logging
import argparse
import csv
import re
import subprocess
import os
import sys
import pathlib
import glob
def parse_args():
# arguments
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", required=True, default=None,
help="Input CRAM or BAM file. Default: None.")
parser.add_argument("--output_dir", required=False, default="./",
help="Path to output directory. Default: ./")
parser.add_argument("--coverage", required=False, default="15",
help="Desired coverage for new BAM file. Default: 15")
parser.add_argument("--thread", required=False, default="4",
help="Number of threads to use (at least 4 for Mosdepth...deal with it). Default: 4")
parser.add_argument("--sample_name", required=False, default=None,
help="Sample Name to use, otherwise will default to CRAM files sample name. Useful when dealing with benchmarking data to avoid overwriting eg. SRR11321732_hg19 vs SRR11321732_GRCh38.")
parser.add_argument("--skip_dupRm", required=False, default=False, action="store_true",
help="Do not do a dup remove (directly downsample original BAM). Default: False.")
parser.add_argument("--tmp", required=False, default=None,
help="Temporary directory, deleted after processing. Default: ./sampleID/tmp ")
parser.add_argument("--loglevel", required=False, default="DEBUG",
help="Set logging level to DEBUG (default), INFO or WARNING.")
args = parser.parse_args()
# checks
if not os.path.exists(args.input_file):
logging.error("Input file not found: %s" % args.input_file)
sys.exit(1)
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)
try:
subprocess.call(["which", "mosdepth"], stdout=subprocess.DEVNULL)
except:
logging.error("mosdepth not found")
sys.exit(1)
try:
subprocess.call(["which", "sambamba"], stdout=subprocess.DEVNULL)
except:
logging.error("sambamba not found")
sys.exit(1)
return args
def set_logging(loglevel):
numeric_level = getattr(logging, loglevel.upper(), None)
if not isinstance(numeric_level, int):
raise ValueError("Invalid log level: %s" % loglevel)
logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=numeric_level)
def check_type(input_file):
if input_file.endswith(('.cram','.CRAM')):
return "cram"
elif input_file.endswith(('.bam','.BAM')):
return "bam"
else:
# Not sure what are you processing, must DIE
logging.error("input_file is not bam or cram. Dying now!")
sys.exit(1)
def get_ID(input_file , sample_name):
if sample_name is not None:
return sample_name
else:
# assume sample ID is the first segment (sample1) in bam file named sample1.bqsr.sorted.whatever.bam
bam = input_file.split('/')[-1]
sampleID = bam.split('.')[0]
return sampleID
def temp_output_loc( out_dir , tmp , sampleID , cov):
if tmp is not None:
tempLoc = "%s/%s/%sx" % (tmp , sampleID , cov)
logging.info("Creating temp folder %s" % tempLoc)
pathlib.Path(tempLoc).mkdir(parents=True, exist_ok=True)
return tempLoc
else:
tempLoc = "%s/%s/tmp" % (out_dir , sampleID)
logging.info("Creating temp folder %s" % tempLoc)
pathlib.Path(args.out_dir).mkdir(parents=True, exist_ok=True)
return tempLoc
def remove_dup(input_file , tempLoc , file_type , sampleID , skip_dupRm , thread):
filterString = "not (unmapped or mate_is_unmapped) and paired and not duplicate"
cram = "-C"
logging.info("Sanitising input %s " % input_file)
if skip_dupRm :
logging.info("Not removing duplicates reads")
filterString = "not (unmapped or mate_is_unmapped) and paired"
if re.match("bam", file_type):
cram = ""
# run
# sambamba view -F 'not (unmapped or mate_is_unmapped) and paired and not duplicate' -q -C -f bam -t ${THREAD} -o ${OUT}/${SAMPLE}.dupRm.bam ${INPUT_CRAM}
cmd = "sambamba view -F '%s' -q %s -f bam -t %d -o %s/%s.dupRm.bam %s" % ( filterString , cram , int(thread) , tempLoc , sampleID , input_file )
if os.system(cmd) != 0:
logging.error("Sambamba sanitising bam/cram failed. Dying now.")
sys.exit(1)
else:
bamLoc = "%s/%s.dupRm.bam" % ( tempLoc , sampleID )
logging.info("Sanitised bam found at %s " % bamLoc)
return bamLoc
def get_coverage( Bam , sampleID , tempFolder , subfolder ):
# run
# mosdepth --no-per-base --threads 4 ${OUT_L} ${IN}
sub_tmp = "%s/%s" % ( tempFolder , subfolder )
pathlib.Path(sub_tmp).mkdir(parents=True, exist_ok=True)
cmd = "mosdepth --no-per-base --threads 4 %s/%s %s " % ( sub_tmp , sampleID , Bam)
if os.system(cmd) != 0:
logging.error("Get coverage %s failed. Dying now." % subfolder )
sys.exit(1)
summary_file = "%s/%s.mosdepth.summary.txt" % ( sub_tmp , sampleID)
mos_cov = open(summary_file, "r")
length = 0
bases = 0
for line in mos_cov :
# only counts for chr1-22 or 1-22
if re.search(r"^[0-9]{1,2}", line) or re.search(r"^chr[0-9]{1,2}", line):
array_line = re.split(r'\t+', line.strip())
length += int(array_line[1])
bases += int(array_line[2])
cov = bases/length
logging.info("%s coverage is : %f" % ( Bam , cov ) )
return cov
def get_fraction( neededCov , bamCov):
if neededCov >= bamCov :
logging.error("Needed coverage ( %f ) is more than the CRAM's coverage ( %f ): unable to downsample." % ( neededCov , bamCov))
sys.exit(1)
else:
fraction = float( neededCov ) / float( bamCov )
logging.info("Downsampling CRAM with factor: %f " % fraction)
return fraction
def downsample_bam(dupRemovedBam , output_dir , sampleID , fraction , seed , cov ):
# run
# sambamba view -s ${FACTOR} --subsampling-seed=${SEED} -f bam -o ${OUT_DOWNSAMPLING}/${SAMPLE}/${SAMPLE}.dupRm.subsam.bam ${OUT}/${SAMPLE}.dupRm.bam
out_prefix = "%s/%s/%sx" % ( output_dir , sampleID , cov )
pathlib.Path(out_prefix).mkdir(parents=True, exist_ok=True)
cmd = "sambamba view -s %f --subsampling-seed=%d -f bam -o %s/%s.dupRm.subsam.bam %s" % ( fraction , seed , out_prefix , sampleID , dupRemovedBam)
logging.info("Downsampling %s " % dupRemovedBam )
if os.system(cmd) != 0:
logging.error("Sambamba downsampling failed. Dying now.")
sys.exit(1)
else:
bamLoc = "%s/%s.dupRm.subsam.bam" % ( out_prefix , sampleID )
logging.info("Downsampled bam found at %s " % bamLoc)
return bamLoc
def cleanUp ( tempFolder ):
logging.info("Cleaning up, deleting %s" % tempFolder)
cmd = "rm -rf %s" % tempFolder
os.system(cmd)
logging.info("Done!")
if __name__ == "__main__":
# seed number to make result reproducible
seed = 10
args = parse_args()
set_logging(args.loglevel)
# check input is CRAM or BAM
file_type = check_type(args.input_file)
# get sample ID
sampleID = get_ID(args.input_file , args.sample_name )
# check and create intermediate ouput directory
tempFolder = temp_output_loc ( args.output_dir , args.tmp , sampleID , args.coverage)
# Remove dup and keep ONLY paired reads + inflat if CRAM
dupRemovedBam = remove_dup(args.input_file , tempFolder , file_type , sampleID , args.skip_dupRm , args.thread)
# Get coverage of input file - Round 1
cov = get_coverage( dupRemovedBam , sampleID , tempFolder , "R1")
# Check Coverage of input is larger than desired coverage, if true get "fraction" needed for downsampling
fraction = get_fraction( args.coverage , cov )
# Downsample to fraction
downsampledBam = downsample_bam(dupRemovedBam , args.output_dir , sampleID , fraction , seed , args.coverage )
# Get coverage for NEW BAM
newCov = get_coverage( downsampledBam , sampleID , tempFolder , "R2")
# clean up and Done
cleanUp( tempFolder )