Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release/1.0.0 #21

Merged
merged 2 commits into from
Aug 12, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# Change Log

## [1.0.0] - 2015-08-11
### Added
- Minimum ORF length filtering
- Related options (Constant minimum length, percentage minimum length, adjustment for smaller read lengths)

### Fixed
- Translation of the edges of detected ORFs (previously they were truncated)

### Changed
- A few minor description changes

## [0.2.1] - 2015-08-06
### Fixed
- Alignment stats shown at completion now properly account for supplementary alignments
Expand Down
16 changes: 10 additions & 6 deletions align.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,12 @@ int command_align(int argc, char *argv[]) {

aux.opt = opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
opt->proteinFlag |= ALIGN_FLAG_BRUTEORF; // Temporary

while ((c = getopt(argc, argv, "1epabgnFMCSPVYju:k:o:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) {
while ((c = getopt(argc, argv, "1epabgnMCSPVYJjf:F:u:k:o:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == 'u') opt->outputType = atoi(optarg);
//else if (c == 'o') opt->min_orf_len = atoi(optarg);
else if (c == 'f') opt->min_orf_len = atoi(optarg);
else if (c == 'F') opt->min_orf_percent = atof(optarg);
else if (c == 'o') prefixName = optarg;
else if (c == '1') no_mt_io = 1;
else if (c == 'x') mode = optarg;
Expand All @@ -137,9 +137,10 @@ int command_align(int argc, char *argv[]) {
else if (c == 'F') opt->flag |= MEM_F_ALN_REG;
else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
else if (c == 'b') opt->proteinFlag &= ~ALIGN_FLAG_BRUTEORF;
else if (c == 'b') opt->proteinFlag &= ~ALIGN_FLAG_BRUTE_ORF;
else if (c == 'g') opt->proteinFlag |= ALIGN_FLAG_GEN_NT;
else if (c == 'n') opt->proteinFlag |= ALIGN_FLAG_KEEP_PRO;
else if (c == 'J') opt->proteinFlag &= ~ALIGN_FLAG_ADJUST_ORF;
else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
else if (c == 'v') bwa_verbose = atoi(optarg);
Expand Down Expand Up @@ -389,9 +390,12 @@ int renderAlignUsage(const mem_opt_t * passOptions) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: paladin align [options] <idxbase> <in.fq>\n\n");

fprintf(stderr, "Protein detection options:\n\n");
fprintf(stderr, "Gene detection options:\n\n");
fprintf(stderr, " -b disable brute force ORF detection\n");
//fprintf(stderr, " -l INT minimum ORF length accepted during protein detection (DISABLED) [%d]\n", passOptions->min_orf_len);
fprintf(stderr, " -J do not adjust minimum ORF length (constant value) for shorter read lengths\n");
fprintf(stderr, " -f INT minimum ORF length accepted (as constant value) [%d]\n", passOptions->min_orf_len);
fprintf(stderr, " -F FLOAT minimum ORF length accepted (as percentage of read length) [%.2f]\n", passOptions->min_orf_percent);


fprintf(stderr, "\nAlignment options:\n\n");
fprintf(stderr, " -t INT number of threads [%d]\n", passOptions->n_threads);
Expand Down
9 changes: 5 additions & 4 deletions bwamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ mem_opt_t *mem_opt_init()
o->outputStream = stdout;
o->outputType = OUTPUT_TYPE_UNIPROT_FULL;
o->flag = 0;
o->proteinFlag = 0;
o->a = 1; o->b = 1; //4;
o->o_del = o->o_ins = 6;
o->e_del = o->e_ins = 1;
Expand All @@ -63,10 +62,12 @@ mem_opt_t *mem_opt_init()
o->zdrop = 100;
o->pen_unpaired = 17;
o->pen_clip5 = o->pen_clip3 = 5;

o->max_mem_intv = 20;

o->min_orf_len = 180;
o->min_orf_len = 250;
o->min_orf_percent = 0;
o->proteinFlag = 0;
o->proteinFlag |= ALIGN_FLAG_BRUTE_ORF;
o->proteinFlag |= ALIGN_FLAG_ADJUST_ORF;
o->min_seed_len = 11;
o->split_width = 10;
o->max_occ = 500;
Expand Down
3 changes: 2 additions & 1 deletion bwamem.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ typedef struct {
int flag; // see MEM_F_* macros
int proteinFlag; // see ALIGN_FLAG_* protein-related defines
int indexFlag; // see INDEX_FLAG_* index/frame related defines
int min_orf_len; // minimum ORF length accepted during protein detection
int min_orf_len; // minimum ORF length accepted during protein detection (as constant value)
float min_orf_percent; // minimum orf length accepted during protein detection (as percent of read length)
int min_seed_len; // minimum seed length
int min_chain_weight;
int max_chain_extend;
Expand Down
44 changes: 26 additions & 18 deletions main.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
The MIT License

Copyright (c) 2015 by Anthony Westbrook, University of New Hampshire <[email protected]>
Copyright (c) 2011 by Attractive Chaos <[email protected]>

Permission is hereby granted, free of charge, to any person obtaining
Expand All @@ -22,27 +23,36 @@
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

Contact: Toni Westbrook <[email protected]>

/*
PALADIN (Protein Alignment and Detection Interface)

PALADIN is a protein sequence alignment tool based on the BWA source. Like BWA, it aligns
sequences via read-mapping using BWT. PALADIN, however, offers the novel approach
of aligning in the protein space. During the index phase, it processes the reference genome's
nucleotide sequences and GTF/GFF annotation containing CDS entries, first
converting these transcripts into the corresponding protein sequences, then creating the BWT
and suffix array from these proteins. During the alignment phase, it attempts to find ORFs in
the read sequences, then converts these to protein sequences, and aligns to the reference
protein sequences.

PALADIN currently only supports single-end reads, and BWA-MEM based alignment. It makes
use of many BWA parameters and is therefore compatible with many of its command line arguments.
PALADIN is a protein sequence alignment tool designed for the accurate
functional characterization of metagenomes.

PALADIN IS CURRENTLY ALPHA AND HAS NOT BEEN FULLY TESTED. USE AT YOUR OWN RISK.

For information regarding BWA, please contact its author, Heng Li <[email protected]>
PALADIN is based on BWA, and aligns sequences via read-mapping using
BWT. PALADIN, however, offers the novel approach of aligning in the
protein space. During the index phase, it processes the reference genome's
nucleotide sequences and GTF/GFF annotation containing CDS entries, first
converting these transcripts into the corresponding protein sequences, then
creating the BWT and suffix array from these proteins. The process of
translation is skipped when providing a protein reference file (e.g., UniProt)
for mapping. During the alignment phase, it attempts to find ORFs in the
read sequences, then converts these to protein sequences, and aligns to the
reference protein sequences.

PALADIN currently only supports single-end reads (or reads merged with FLASH,
PEAR, abyss-mergepairs), and BWA-MEM based alignment. It makes use of many
BWA parameters and is therefore compatible with many of its command line
arguments.

PALADIN may output a standard SAM file, or a text file containing a
UniProt-generated functional profile. This text file may be used for all
downstream characterizations.

Contact: Toni Westbrook <[email protected]>
For information regarding BWA, contact Heng Li <[email protected]>
*/

#include <stdio.h>
Expand Down Expand Up @@ -130,5 +140,3 @@ int renderVersion() {

return 1;
}


60 changes: 59 additions & 1 deletion main.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,66 @@
/*
The MIT License

Copyright (c) 2015 by Anthony Westbrook, University of New Hampshire <[email protected]>
Copyright (c) 2011 by Attractive Chaos <[email protected]>

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
PALADIN (Protein Alignment and Detection Interface)

PALADIN is a protein sequence alignment tool designed for the accurate
functional characterization of metagenomes.

PALADIN is based on BWA, and aligns sequences via read-mapping using
BWT. PALADIN, however, offers the novel approach of aligning in the
protein space. During the index phase, it processes the reference genome's
nucleotide sequences and GTF/GFF annotation containing CDS entries, first
converting these transcripts into the corresponding protein sequences, then
creating the BWT and suffix array from these proteins. The process of
translation is skipped when providing a protein reference file (e.g., UniProt)
for mapping. During the alignment phase, it attempts to find ORFs in the
read sequences, then converts these to protein sequences, and aligns to the
reference protein sequences.

PALADIN currently only supports single-end reads (or reads merged with FLASH,
PEAR, abyss-mergepairs), and BWA-MEM based alignment. It makes use of many
BWA parameters and is therefore compatible with many of its command line
arguments.

PALADIN may output a standard SAM file, or a text file containing a
UniProt-generated functional profile. This text file may be used for all
downstream characterizations.

Contact: Toni Westbrook <[email protected]>
For information regarding BWA, contact Heng Li <[email protected]>
*/


#ifndef MAIN_H_
#define MAIN_H_

#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.2.1"
#define PACKAGE_VERSION "1.0.0"
#endif

// Render usage and version details
Expand Down
Loading