-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathriborf.py
205 lines (166 loc) · 7.58 KB
/
riborf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/env python3
"""
Author : Pierre Levy <[email protected]>
Date : 2022-02-23
Purpose: Riborf
requirements:
Bam file from the ribolace_process.py script (STAR-aligned)
Samtools
gtfToGenePred to make Transcript annotation file in genePred format
Genome assembly file in Fasta format
Linux high performance computing cluster
Perl program installation
R program installation in the PATH
RibORF package: https://github.com/zhejilab/RibORF/
RUN pipeline from the RibORF directory
"""
import argparse
import os
import shutil
import logging
import re
from pipeline.common import *
# --------------------------------------------------
def get_args():
"""Get command-line arguments"""
parser = argparse.ArgumentParser(
description='RibORF analysis of translating ORFs',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('sample',
metavar='sample',
help='sample name')
parser.add_argument('bam',
metavar='bam',
help='bam output from STAR alignment')
parser.add_argument('-g',
'--genome',
metavar='\b',
help='Path to genome ref fasta file',
default='/ref/GRCh38.primary_assembly.genome.fa')
parser.add_argument('-p',
'--genePred',
metavar='\b',
help='Path to genePred annotation file')
parser.add_argument('-t',
'--transcriptGenePred',
metavar='\b',
help='Path to cDNA genePred annotation file',
default='/ref/gencode.v39.protein_coding.genePred')
parser.add_argument('-o',
'--output',
metavar='\b',
help='Path to output dir',
default='~/Riborf_out')
parser.add_argument('-d',
'--readlength',
metavar='\b',
help='read lengths to consider for readDist step',
default='32,33,34,35,36')
parser.add_argument('-f',
'--offset',
metavar='\b',
help='optionnal: provide to create custom offset.correction.parameters.txt',
default='13,13,13,13,13')
parser.add_argument('-c',
'--candidates',
metavar='\b',
help='path to candidateORF.genepred.txt')
parser.add_argument('-r',
'--gtfToGenePred',
help='optionnal: runs gtfToGenePred on genome annotation file',
action='store_true')
parser.add_argument('-w',
'--gtf',
help='optionnal: gtf annotation file if gtfToGenePred step required',
action='store_true')
parser.add_argument('-a',
'--annotate',
help='optionnal: runs ORF annotate step',
action='store_true')
return parser.parse_args()
# --------------------------------------------------
def main():
"""Main function"""
# assign arguments variable names
args = get_args()
sample = args.sample
bam = args.bam
genome = args.genome
gtf = args.gtf
genePred = args.genePred
transcript = args.transcriptGenePred
readlength = args.readlength
orf_annotate = args.annotate
candidates = args.candidates
output = args.output
offset = args.offset
gtfToGenePred = args.gtfToGenePred
# create output directory
outdir = f'{output}/{sample}_out' # create outdir variable
if not os.path.exists(outdir):
os.mkdir(outdir) # create dir
# Create log file (called riborf.log)
logging.basicConfig(format='%(asctime)s - %(message)s',
datefmt='%d-%b-%y %H:%M:%S',
level=logging.DEBUG,
handlers=[
logging.FileHandler(f"{outdir}/riborf.log"),
logging.StreamHandler() # these 2 handlers allow 1) to have the log file created and 2) to stream to the terminal
])
logger = logging.getLogger() # creates riboseq logger to add entries to the log
# ******************************************************************************************
# Log start time
logger.info("Start RibORF pipeline")
if gtfToGenePred:
genePred = re.sub("\.gtf.*", "", gtf) + ".genePred"
cmd = f'gtfToGenePred {gtf} {genePred}'
if orf_annotate:
# ORFannotate.pl
logger.info("****** Generate Candidate ORFs ******")
cmd = f'perl ORFannotate.pl -g {genome} -t {genePred} -o {outdir}'
exec_command(cmd)
# Convert STAR Bam file to Sam file, input for readDist script
logger.info("****** Step 1 = Convert STAR Bam file to Sam file, input for readDist script ******")
cmd = f"samtools view -h -o {outdir}/{sample}.dedup.sam {bam}"
exec_command(cmd)
# ReadDist.pl on Star output
logger.info("****** Step 2 = readDist.pl using protein coding genePred ******")
cmd = f"perl readDist.pl -f {outdir}/{sample}.dedup.sam -g {transcript} -o {outdir} -d {readlength}"
exec_command(cmd)
# Create offset.correction.parameters.txt if different from default or copy it from pipeline dir if default
if offset != "13,13,13,13,13":
offsetCorrectionFile = open(f"{outdir}/offset.correction.parameters.txt", "w")
readlengths=readlength.split(",")
offsets=offset.split(",")
offsetCorrection=""
for rl,off in zip(readlengths,offsets):
offsetCorrection = offsetCorrection + rl + "\t" + off + "\n"
offsetCorrectionFile.write(offsetCorrection)
offsetCorrectionFile.close()
else:
shutil.copyfile("/RiboNeo/riborf/offset.correction.parameters.txt", f"{outdir}/offset.correction.parameters.txt")
# offsetCorrect.pl: correct read locations based on offset distances between 5’ ends and ribosomal A-sites
logger.info("****** Step 3 = offsetCorrect.pl ******")
cmd = f"perl offsetCorrect.pl -r {outdir}/{sample}.dedup.sam -p {outdir}/offset.correction.parameters.txt -o {outdir}/{sample}.offset_corrected.sam"
exec_command(cmd)
# readDist.pl on offset-corrected reads
logger.info("****** Step 4 = readDist.pl on offset-correctd reads ******")
cmd = f"perl readDist.pl -f {outdir}/{sample}.offset_corrected.sam -g {transcript} -o {outdir} -d 1"
exec_command(cmd)
# ribORF.pl to identify translated ORFs
logger.info("****** Step 5 = ribORF.pl to identify translated ORFs ******")
if orf_annotate:
cmd = f"perl ribORF.pl -f {outdir}/{sample}.offset_corrected.sam -c {outdir}/candidateORF.genepred.txt -o {outdir}"
else:
cmd = f"perl ribORF.pl -f {outdir}/{sample}.offset_corrected.sam -c {candidates} -o {outdir}"
exec_command(cmd)
logger.info("****** ribORF.pl done ******")
# Create offset-corrected bam file from sam file
logger.info("****** Step 6 = Create offset-corrected bam file from sam file and index ******")
cmd = f"samtools sort {outdir}/{sample}.offset_corrected.sam | samtools view -Sb -o {outdir}/{sample}.offset_corrected.bam \
&& samtools index {outdir}/{sample}.offset_corrected.bam"
exec_command(cmd)
logger.info("****** THE END ******")
# --------------------------------------------------
if __name__ == '__main__':
main()