-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
222 lines (185 loc) · 8.28 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#!/usr/bin/env python3
"""
Author : Pierre Levy <[email protected]>
Date : 2022-02-23
Purpose: Process ribo-seq fastq files from Ribolace seq until STAR alignment
"""
import argparse
import os
import logging
from pipeline.common import *
# --------------------------------------------------
def get_args():
"""Get command-line arguments"""
parser = argparse.ArgumentParser(
description='Process ribo-seq fastq files',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('sample',
metavar='sample',
help='sample name')
parser.add_argument('fastq',
metavar='fastq',
help='path to fastq')
parser.add_argument('-i',
'--rrnai',
metavar='\b',
help='Path to rRNA bowtie2 index prefix',
default='refs/rRNA')
parser.add_argument('-j',
'--trnai',
metavar='\b',
help='Path to tRNA bowtie2 index prefix',
default='refs/tRNA')
parser.add_argument('-l',
'--lowlen',
metavar='\b',
help='lower length filter RiboWaltz',
default='28')
parser.add_argument('-u',
'--uplen',
metavar='\b',
help='upper length filter RiboWaltz',
default='36')
parser.add_argument('-r',
'--rds',
metavar='\b',
help='Path to RiboWalzt annotation RDS')
parser.add_argument('-s',
'--stari',
metavar='\b',
help='Path to STAR index')
parser.add_argument('-g',
'--gtf',
metavar='\b',
help='Path to GTF file')
parser.add_argument('-a',
'--annotate',
help='optionnal: runs riboWalz create_annotation(gtf)',
action='store_true')
parser.add_argument('-T',
'--threads',
help='Number of threads to use',
metavar='\b', # metavar \b to not show any metavar except the short and long flag
type=int,
default=4)
return parser.parse_args()
# --------------------------------------------------
def main():
"""Main function"""
# assign arguments variable names
args = get_args()
sample = args.sample
fastq = args.fastq
threads = args.threads
rrna_i = args.rrnai
trna_i = args.trnai
starindex = args.stari
gtf = args.gtf
ribowaltz_annotate = args.annotate
lowlen = args.lowlen
uplen = args.uplen
annotRds = args.rds
# create output directory out
if not os.path.exists(f'{os.getcwd()}/{sample}_out'):
os.mkdir(f'{os.getcwd()}/{sample}_out') # create dir
outdir = f'{os.getcwd()}/{sample}_out' # create outdir variable
# Create log file in outdir (called riboseq.log)
logging.basicConfig(format='%(asctime)s - %(message)s',
datefmt='%d-%b-%y %H:%M:%S',
level=logging.DEBUG,
handlers=[
logging.FileHandler(f"{outdir}/riboseq.log"),
logging.StreamHandler() # these 2 handlers allow 1) to have the log file created and 2) to stream to the terminal
])
logger = logging.getLogger() # creates riboseq logger to add entries to the log
# Log start time
logger.info("Start Ribo-Seq Process")
# qc for raw fastqs
logger.info("****** Step 1 = Fastqc ******")
if not os.path.exists(f'{outdir}/qc'):
os.mkdir(f'{outdir}/qc') # create dir
cmd = f'fastqc --nogroup --threads {threads} {fastq} -o {outdir}/qc' # define shell command
exec_command(cmd) # execute shell command (see above for exec_command function def)
# cutadapt (1)
logger.info("****** Step 2 = Cutadapt (1) ******")
cmd = f'cutadapt --minimum-length 20 -a TCTCCTTGCATAATCACCAACC --discard-untrimmed --cores={threads} \
-o {outdir}/trim.fastq {fastq} >> {outdir}/riboseq.log'
exec_command(cmd)
# umi_tools (extract UMIs)
logger.info("****** Step 3 = Umi_tools (extract UMIs) ******")
cmd = f"umi_tools extract -I {outdir}/trim.fastq --bc-pattern='(?P<umi_1>.{{4}}).+(?P<umi_2>.{{4}})$' \
--extract-method=regex -S {outdir}/extract.fq"
exec_command(cmd)
# cutadapt (2)
logger.info("****** Step 4 = Cutadapt (2) ******")
cmd = f"cutadapt --cut 1 -o {outdir}/trim2.fq {outdir}/extract.fq --cores={threads} >> {outdir}/riboseq.log"
exec_command(cmd)
# Bowtie2 exclusion of rRNA reads ******* CHANGE PATH TO INDEX!!! MAKE VARIABLE FLAG *********
logger.info("****** Step 5 = Bowtie2 exclusion of rRNA reads ******")
cmd = f"bowtie2 -p {threads} -N 1 --no-1mm-upfront \
-q {outdir}/trim2.fq \
--un {outdir}/no_rRNA.fq \
-x {rrna_i} \
> /dev/null 2> >(tee -a {outdir}/riboseq.log >&2)" #sends stdout to null, stderr to log file and redirects it to stderr to show on terminal screen
exec_command(cmd)
# Bowtie2 exclusion of tRNA reads
logger.info("****** Step 6 = Bowtie2 exclusion of tRNA reads ******")
cmd = f"bowtie2 -p {threads} -N 1 --no-1mm-upfront \
-q {outdir}/no_rRNA.fq \
--un {outdir}/no_rRNA_no_tRNA.fq \
-x {trna_i} \
> /dev/null 2> >(tee -a {outdir}/riboseq.log >&2)" #sends stdout to null, stderr to log file and redirects it to stderr to show on terminal screen
exec_command(cmd)
# STAR mapping to genome
logger.info("****** Step 7 = STAR mapping to genome ******")
cmd = f"STAR --genomeDir {starindex} \
--readFilesIn {outdir}/no_rRNA_no_tRNA.fq \
--runThreadN {threads} \
--outSAMtype BAM SortedByCoordinate \
--quantMode TranscriptomeSAM \
--alignEndsType EndToEnd \
-outFilterMultimapNmax 1 \
--sjdbGTFfile {gtf} \
--alignIntronMin 20 \
--alignIntronMax 100000 \
--outFilterMismatchNmax 1 \
--outFilterType BySJout \
--outFilterMismatchNoverLmax 0.04 \
--twopassMode Basic \
--outFileNamePrefix {outdir}/"
exec_command(cmd)
# Samtools index
logger.info("****** Step 8 = Samtools index of STAR output bam ******")
cmd = f"samtools index {outdir}/Aligned.sortedByCoord.out.bam {outdir}/Aligned.sortedByCoord.out.bam.bai"
exec_command(cmd)
# umi_tools (remove duplicates)
logger.info("****** Step 9 = Umi_tools (remove duplicates) ******")
cmd = f"umi_tools dedup -I {outdir}/Aligned.sortedByCoord.out.bam -S {outdir}/{sample}.dedup.bam \
--method=directional --log2stderr 2> >(tee -a {outdir}/riboseq.log >&2)"
exec_command(cmd)
# Samtools index 2
logger.info("****** Step 10 = Samtools index final bam ******")
cmd = f"samtools index {outdir}/{sample}.dedup.bam {outdir}/{sample}.dedup.bam.bai"
exec_command(cmd)
# Delete temporary files
temp_files = ["Aligned.sortedByCoord.out.bam", "Aligned.sortedByCoord.out.bam.bai", \
"trim.fastq", "extract.fq", "trim2.fq", "no_rRNA.fq", "no_rRNA_no_tRNA.fq"]
for file in temp_files:
if os.path.exists(f"{outdir}/{file}"):
os.remove(f"{outdir}/{file}")
# RiboWaltz
if ribowaltz_annotate:
logger.info("****** Step 11 = RiboWaltz annotate ******")
cmd = f"Rscript /RiboNeo/ribowaltz/ribowaltz_annot.R {gtf} {outdir}"
exec_command(cmd)
logger.info("****** Step 12 = RiboWaltz ******")
cmd = f"Rscript /RiboNeo/ribowaltz/ribowaltz.R {sample} {outdir} {outdir}/annotation_ribowaltz.rds {lowlen} {uplen} {outdir}"
else:
logger.info("****** Step 11 = RiboWaltz ******")
cmd = f"Rscript /RiboNeo/ribowaltz/ribowaltz.R {sample} {outdir} {annotRds} {lowlen} {uplen} {outdir}"
exec_command(cmd)
# THE END
logger.info(f"****** Preprocessing and RiboWaltz steps completed! ******")
# --------------------------------------------------
if __name__ == '__main__':
main()