-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrunRetroviral.py
178 lines (146 loc) · 6.46 KB
/
runRetroviral.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# runRetroviral.py
# 3/27/2016
# Makes histograms and count tables for retroviral data; requires either a
# trimmed FASTQ file or a STAR-generated BAM file as input
import sys
import os
import argparse
import pandas as pd
import numpy as np
import collections
import csv
csv.register_dialect("textdialect", delimiter='\t')
### Parsing arguments ###
parser = argparse.ArgumentParser(description="runRetroviral.py", epilog="Example: python runRetroviral.py ")
parser.add_argument('-i', metavar='INPUT', help="Input fastq or bam file", required=True)
group = parser.add_mutually_exclusive_group()
group.add_argument('--fastq', action='store_true', help="required if your input is a trimmed FASTQ or FASTQ.GZ")
group.add_argument('--bam', action='store_true', help="required if your input is a trimmed BAM")
group2 = parser.add_mutually_exclusive_group()
group2.add_argument('--human', action='store_true', help="required if your file is from human")
group2.add_argument('--mouse', action='store_true', help="required if your file is from mouse")
parser.add_argument('--compressed', action='store_true', help="required if your input is *.gz")
parser.add_argument('--iclip', action='store_true', help="specify if file is from CLIP data")
parser.add_argument('-p', metavar='PREFIX', help="Prefix of output files", required=True)
parser.add_argument('-sr', metavar='STAR_RATIO', type=float, help="Maximum mismatches per base allowed for STAR genome mapping (corresponds to outFilterMismatchNoverLmax). Default is 0.08 (2 mismatches per 25 mapped bases).", default=0.08)
# organism
args = parser.parse_args()
if not (args.human or args.mouse):
print "Error: must include --human or --mouse. Exiting."
exit()
if args.human: org = 'human'
else: org = 'mouse'
# filetype
if not (args.fastq or args.bam):
print "Error: must include --fastq or --bam. Exiting."
exit()
if args.fastq: filetype = 'fastq'
else: filetype = 'bam'
# other arguments
compressed = True if args.compressed else False
iclip = True if args.iclip else False
ifile = args.i
prefix = args.p
star_ratio = args.sr
home = "/home/raflynn/CLIP/"
if org == 'human':
repeat_index = home + '/docs/hg19/repeat/rep_spaced' # bt2 index for repeat RNA.
retro_index = home + '/docs/hg19/retroviral/'
retro_annotation = retro_index + 'Hs_retroviralIndex_spaced_positions.txt'
elif org == 'mouse':
repeat_index = home + '/docs/mm9/repeat/rep_spaced' # bt2 index for repeat RNA.
retro_index = home + '/docs/mm9/retroviral/'
retro_annotation = retro_index + 'Mm_retroviralIndex_spaced_positions.txt'
def make_bam(ifile):
# map to repeat
print "Mapping fastq to repeat"
rep_unmapped = prefix + "_repeat_unmapped.fastq"
if compressed or ifile[-3:] == '.gz':
cmd = "zcat {} | bowtie2 -p 8 -x {} - --un {} > /dev/null".format(ifile, repeat_index, rep_unmapped)
else:
cmd = "cat {} | bowtie2 -p 8 -x {} - --un {} > /dev/null".format(ifile, repeat_index, rep_unmapped)
os.system(cmd)
# map to retro
print "Mapping fastq to retroviral"
retro_prefix = prefix + "_retro_"
retro_mapped = prefix + "_retro_mapped.sam"
cmd = "STAR --genomeDir {} --runThreadN 8 --genomeLoad LoadAndKeep --readFilesIn {} --outFileNamePrefix {} --alignEndsType EndToEnd --outFilterMismatchNoverLmax {}".format(retro_index, rep_unmapped, retro_prefix, star_ratio)
os.system(cmd)
os.system("mv {} {}".format(retro_prefix + "Aligned.out.sam", retro_mapped))
# make bam file
print "Making BAM file"
bamfile_sort = prefix + "_retro_mapped.bam"
cmd = "cat {} | samtools view -Su - -o - | samtools sort - {}".format(retro_mapped, bamfile_sort)
os.system(cmd)
return bamfile_sort
def make_hist(bed):
data = pd.read_csv(bed, sep='\t', names=['name', 'start', 'end', 'read', 'qual', 'strand'])
nums = []
for i in data.iterrows():
nums.extend(range(i[1].start, i[1].end))
h = np.histogram(nums, bins=np.max(nums)-np.min(nums))[0]
hist = list(np.zeros(np.min(nums)))
hist.extend(list(h))
return np.array(hist)
def print_list_of_hist(hist, retro):
ofn = prefix + "_retro_hist.txt"
with open(ofn, 'w') as ofile:
writer = csv.writer(ofile, 'textdialect')
numBins = 100
for region in retro.iterrows():
row = [region[1][0] + '__' + str(region[1].len)]
nums = hist[region[1].start:region[1].end]
intended_len = region[1].end - region[1].start
if len(nums) != intended_len:
nums = list(nums)
nums.extend([0]*(intended_len - len(nums)))
nums = np.array(nums)
a = map(lambda x: float(x)*len(nums)/numBins, range(numBins)) # convert my desired scale to the current scale
new_nums = np.interp(a, range(len(nums)), nums)
row.extend(list(new_nums))
writer.writerow(row)
def print_list_of_counts(bed, retro):
data = pd.read_csv(bed, sep='\t', names=['name', 'start', 'end', 'read', 'qual', 'strand'])
ofn = prefix + "_retro_cts.txt"
fam_to_count = collections.defaultdict(lambda: 0)
with open(ofn, 'w') as ofile:
writer = csv.writer(ofile, 'textdialect')
for region in retro.iterrows():
name = region[1][0] + '__' + str(region[1].len)
fam = name.split('__')[1]
ct = data[data.start >= region[1].start][data.start < region[1].end].shape[0]
fam_to_count[fam] += ct
row = [name, region[1].len, ct]
writer.writerow(row)
ofn = prefix + "_retro_family_cts.txt"
with open(ofn, 'w') as ofile:
writer = csv.writer(ofile, 'textdialect')
for f in fam_to_count:
writer.writerow([f, fam_to_count[f]])
def main():
# read in annotation
retro = pd.read_csv(retro_annotation, sep='\t', names=['name', 'len', 'start', 'end'])
# 0a. if FASTQ, then map to repeat, then map to retro, then make multi BAM
if filetype == 'fastq':
bam = make_bam(ifile)
else:
bam = ifile
# 1. with multi BAM make a histogram
# print "Making histograms for each retro element"
# bed = prefix + "_retro_mapped.bed"
# cmd = "bamToBed -i {} > {}".format(bam, bed)
# os.system(cmd)
# total_hist = make_hist(bed)
# print_list_of_hist(total_hist, retro)
# 2. make a uni BAM
unibam = prefix + "_retro_mapped_uni.bam"
cmd = "samtools view -b -F 0x100 {} -o {}".format(bam, unibam)
os.system(cmd)
# 3. with uni BAM make a file of counts (by family and by subfamily)
print "Counting reads for each retro element"
unibed = prefix + "_retro_mapped_uni.bed"
cmd = "bamToBed -i {} > {}".format(unibam, unibed)
os.system(cmd)
print_list_of_counts(unibed, retro)
if __name__ == "__main__":
main()