-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathseqmonk_import
executable file
·460 lines (340 loc) · 12.1 KB
/
seqmonk_import
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
#!/usr/bin/perl --
use warnings;
use strict;
use English;
use FindBin qw($RealBin);
use Getopt::Long;
use IPC::Open3;
# Quick check that they've supplied some arguments
unless (@ARGV) {
die "SeqMonk non-interactive project creator\nUsage is seqmonk_import --outfile [out.smk] --genome [Species/assembly] [file1.bam] [file2.bam]\nRun seqmonk_import --help for more options\n";
}
# Check to see if they've mistakenly downloaded the source distribution
# since several people have made this mistake
if (-e "$RealBin/uk/ac/babraham/SeqMonk/SeqMonkApplication.java") {
die "This is the source distribution of SeqMonk. You need to get the compiled version if you want to run the program\n";
}
if ($ENV{CLASSPATH}) {
$ENV{CLASSPATH} .= ":$RealBin:$RealBin/htsjdk.jar:$RealBin/Jama-1.0.2.jar:$RealBin/commons-math3-3.5.jar";
}
else {
$ENV{CLASSPATH} = "$RealBin:$RealBin/htsjdk.jar:$RealBin/Jama-1.0.2.jar:$RealBin/commons-math3-3.5.jar";
}
# Temp for windows stuff...
#$ENV{CLASSPATH} =~ s/:/;/g;
# We need to make sure the language is English so we can parse the output of some
# standard commands, however if we leave this in place it can break things for people
# in other locales so we need to restore the original value.
my $original_lang = undef;
if (exists $ENV{LANG}) {
$original_lang = $ENV{LANG};
}
$ENV{LANG}='C';
# We need to find the java interpreter. We'll start from the assumption that this
# is included in the path.
my $java_exe = "java";
# We might have bundled a jre with the installation. If that's the case then we'll
# use the interpreter which is bundled in preference to the system one.
# Windows first
if (-e "$RealBin/jre/bin/java.exe") {
$java_exe = "$RealBin/jre/bin/java.exe";
}
# Linux
elsif (-e "$RealBin/jre/bin/java") {
$java_exe = "$RealBin/jre/bin/java";
}
# OSX
elsif (-e "$RealBin/jre/Contents/Home/bin/java") {
$java_exe = "$RealBin/jre/Contents/Home/bin/java";
}
warn "Java interpreter is '$java_exe'\n";
warn `\"$java_exe\" -version`."\n";
my @java_args= ("-Djava.awt.headless=true");
# We now need to scan the command line for switches which we're going
# to pass on to the main java program.
my $help;
my $memory;
my $mapq;
my $genome;
my $outfile;
my $spliced;
my $introns;
my $unspliced;
my $result = GetOptions('help' => \$help,
'memory=i' => \$memory,
'genome=s' => \$genome,
'outfile=s' => \$outfile,
'spliced' => \$spliced,
'introns' => \$introns,
'unspliced' => \$unspliced,
'mapq=i' => \$mapq,
);
my @files = @ARGV;
# Check the simple stuff first
if ($help) {
# Just print the help and exit
print while(<DATA>);
exit;
}
unless (@files) {
die "No files to import specified\n";
}
unless ($outfile) {
die "No output file (--outfile) supplied\n";
}
unless ($genome) {
die "No genome (--genome) supplied\n";
}
unless ($genome =~ /^[^\/]+\/[^\/]+$/) {
die "Genome '$genome' didn't appear to have the correct 'species/assembly' structure\n";
}
if (defined $mapq) {
if ($mapq < 0 or $mapq > 255) {
die "MAPQ ($mapq) must be in the range 0-255\n";
}
}
else {
# We set this automatically
$mapq = -1;
}
if ($unspliced and $spliced) {
die "You can't set both spliced and unspliced on the same import\n";
}
if ($unspliced and $introns) {
die "You can't set both introns and unspliced on the same import\n";
}
# Introns implies spliced even if they didn't say it
if ($introns) {
$spliced = 1;
}
if ($memory) {
if ($memory < 500) {
die "Memory allocation must be at least 500M";
}
}
else {
$memory = determine_optimal_memory();
}
# We now need to correct for the fact that java doesn't honour the
# heap size you set
my $actual_memory = correct_memory($memory);
warn "Correcting for VM actual requested allocation for $memory is $actual_memory\n";
unshift @java_args,"-Xmx${actual_memory}m";
# Expand the default stack size to allow for large recursive sorts
unshift @java_args,"-Xss4m";
# Work out the value we're using for splicing, 0 = auto, 1 = unspliced, 2 = spliced, 3=spliced_introns
my $spliced_value = 0;
$spliced_value = 1 if ($unspliced);
if ($spliced) {
if ($introns) {
$spliced_value = 3;
}
else {
$spliced_value = 2;
}
}
# Put the environment back to how we found it
if (defined $original_lang) {
$ENV{LANG} = $original_lang;
}
else {
delete $ENV{LANG};
}
exec $java_exe ,@java_args, "uk.ac.babraham.SeqMonk.Importer.SeqMonkImporter", $genome, $outfile, $spliced_value,$mapq, @files;
sub print_error {
# We wrap errors like this so we can keep a windows shell window open
# so the user can see any errors we generate
my ($error) = @_;
warn $error;
$_ = <STDIN>;
exit 1;
}
sub correct_memory {
my ($requested_memory) = @_;
my $actual_memory = `$java_exe -Xmx${requested_memory}m uk.ac.babraham.SeqMonk.Utilities.ReportMemoryUsage`;
if ($actual_memory =~ /^(\d+)/) {
$actual_memory = $1;
$actual_memory = int($requested_memory * ($requested_memory/$actual_memory));
return $actual_memory;
}
else {
warn "Failed to correct requested memory. Request output was '$actual_memory'";
return $requested_memory;
}
}
sub determine_optimal_memory {
# We'll set a ceiling for the memory allocation. On a 32-bit OS this is going
# to be 1536m (the max it can safely handle), on a 64-bit OS we won't take more
# than 6GB
my $max_memory = 1331;
# We need not only a 64 bit OS but 64 bit java as well. It's easiest to just test
# java since the OS support must be there if you have a 64 bit JRE.
my $is_java_7 = 0;
my ($in,$out);
open3(\*IN,\*OUT,\*OUT,"$java_exe -version") or print_error("Can't find java");
close IN;
while (<OUT>) {
if (/64-Bit/) {
$max_memory = 10240;
}
if (/build 1\.7/) {
$is_java_7 = 1;
}
}
close OUT;
warn "Memory ceiling is $max_memory\n";
# The way we determine the amount of physical memory is OS dependent.
my $os = $^O;
my $physical;
if ($os =~ /Win/) {
$physical = get_windows_memory($max_memory);
}
elsif ($os =~/darwin/) {
$physical = get_osx_memory($max_memory);
# Add the OSX specific options to use a standard OSX menu bar
# and set the program name to something sensible.
push @java_args, '-Xdock:name=SeqMonk';
push @java_args, "-Xdock:icon=$RealBin/../Resources/seqmonk.icns";
# Java7 on OSX has a bug in the apple screen bar which prevents the
# menu items from being updated correctly resulting in them staying
# disabled when they should be enabled. We'll only enable this view
# when the JRE is an older version.
# Update - this seems to be fixed in newer versions of java 1.7 so
# we'll revert this
# unless ($is_java_7) {
push @java_args, '-Dapple.laf.useScreenMenuBar=true';
# }
}
elsif ($os =~ /bsd/) {
$physical = get_osx_memory($max_memory);
}
else {
$physical = get_linux_memory($max_memory);
}
warn "Raw physical memory is $physical\n";
# We then set the memory to be the minimum of 2/3 of the physical
# memory or the ceiling, whichever is lower.
$physical = int(($physical/3)*2);
if ($max_memory < $physical) {
return $max_memory;
}
warn "Using $physical MB of RAM to launch seqmonk\n";
return $physical;
}
sub get_linux_memory {
# We get the amount of physical memory on linux by parsing the output of free
open (MEM,"free -m |") or print_error("Can't launch free on linux: $!");
while (<MEM>) {
if (/^Mem:\s+(\d+)/) {
return $1;
}
}
close MEM;
print_error("Couldn't parse physical memory from the output of free");
}
sub get_osx_memory {
# We get the amount of physical memory on OSX by parsing the output of top
open (MEM,"top -l 1 -n 0 |") or print_error("Can't get amount of memory on OSX: $!");
# Output from pre-mavericks OSX looks like this:
#
# Processes: 143 total, 5 running, 2 stuck, 136 sleeping, 842 threads
# 2013/10/24 08:22:58
# Load Avg: 1.47, 1.12, 0.90
# CPU usage: 2.12% user, 10.63% sys, 87.23% idle
# SharedLibs: 1040K resident, 0B data, 0B linkedit.
# MemRegions: 33232 total, 2277M resident, 99M private, 781M shared.
# PhysMem: 1650M wired, 3229M active, 1137M inactive, 6016M used, 2168M free.
# VM: 306G vsize, 1026M framework vsize, 5152376(0) pageins, 0(0) pageouts.
# Networks: packets: 3155246/3187M in, 2197390/1031M out.
# Disks: 214828/7466M read, 673736/15G written.
#
# In Mavericks it changed to look like this:
#
# Processes: 172 total, 3 running, 6 stuck, 163 sleeping, 817 threads
# 2013/10/24 08:27:53
# Load Avg: 1.42, 1.78, 2.59
# CPU usage: 11.66% user, 11.66% sys, 76.66% idle
# SharedLibs: 12M resident, 14M data, 0B linkedit.
# MemRegions: 28889 total, 2123M resident, 104M private, 371M shared.
# PhysMem: 6442M used (1287M wired), 1743M unused.
# VM: 388G vsize, 1065M framework vsize, 104(0) swapins, 126(0) swapouts.
# Networks: packets: 2711222/3332M in, 387599/29M out.
# Disks: 340088/8060M read, 174291/11G written.
#
# We're parsing the PhyMem line in each case to get the full amount of
# system memory.
my $total_mem = 0;
while (<MEM>) {
if (/^PhysMem:.*?(\d+)([MG])\s+used.*?(\d+)([MG])\s+(free|unused)/) {
if ($2 eq 'G') {
$total_mem += $1 * 1024;
}
else {
$total_mem += $1;
}
if ($4 eq 'G') {
$total_mem += $3 * 1024;
}
else {
$total_mem += $3;
}
}
}
close MEM;
unless ($total_mem) {
print_error("Could't parse physical memory from the output of top");
}
return $total_mem;
}
sub get_windows_memory {
warn "Getting windows physical memory\n";
# This code was adapted from an answer posted by Tom Feiner on
# stackoverflow
#
# http://stackoverflow.com/questions/423797/how-can-i-find-the-exact-amount-of-physical-memory-on-windows-x86-32bit-using-per
my ($max_memory) = @_;
eval {
require Win32::OLE;
Win32::OLE->import (qw( EVENTS HRESULT in ));
1;
} or do {
print_error("Couldn't load Win32 module to determine windows memory");
};
my $WMI = Win32::OLE->GetObject( "winmgmts:{impersonationLevel=impersonate,(security)}//./" ) || print_error ("Could not get Win32 object: $OS_ERROR");
my $total_capacity = 0;
foreach my $object (in($WMI->InstancesOf( 'Win32_PhysicalMemory' ))) {
$total_capacity += $object->{Capacity};
}
my $total_capacity_in_mb = $total_capacity / (1024*1024);
return $total_capacity_in_mb;
}
__DATA__
SeqMonk Importer - Creating SeqMonk Projects from the command line
SYNOPSIS
seqmonk_import [--(un)spliced] [--mapq=20] --genome "Mus musculus/GRCm38" --outfile out.smk *.bam
DESCRIPTION
This script allows you to run seqmonk in a non-interactive mode to read in
a number of BAM or Bismark coverage files and save these into a single project
file which you can then transfer to an interactive server to do further
downstream analysis.
The options for the program as as follows:
--genome The genome to use for the import. This is specified as
species/assembly and must match an existing genome in your
seqmonk genomes folder.
--outfile The name of the file you want to write the project to
--spliced Split spliced reads so you only see the exonic parts. Will
be added by default if any of the first 100,000 reads in
the first imported BAM file have a splice site in them.
Adding this flag overrides the auto-detection.
--unspliced No not split spliced reads even if they are present. This
flag overrides the default auto-detection.
--introns When importing with --spliced import introns rather than exons
--mapq Value to use as a MAPQ cutoff for imported reads. Defaults
to 20 if any of the first 100,000 reads has a value above 20.
-h --help Print this help file and exit
-m --memory Set the starting memory allocation in megabytes. Defaults
to 1300. Minimum allowed value is 500 and values above
1300 should only be set on systems running a 64-bit JRE
BUGS
Any bugs in SeqMonk should be reported either to [email protected]
or in the github issue tracker.