diff --git a/src/readfilter.cpp b/src/readfilter.cpp index 34d6dffc842..f24e84ce127 100644 --- a/src/readfilter.cpp +++ b/src/readfilter.cpp @@ -24,6 +24,8 @@ ostream& operator<<(ostream& os, const Counts& counts) { << "Min Quality Filter: " << counts.counts[Counts::FilterName::min_mapq] << endl << "Min Base Quality Filter: " << counts.counts[Counts::FilterName::min_base_qual] << endl << "Random Filter: " << counts.counts[Counts::FilterName::random] << endl + << "Annotation Filter: " << counts.counts[Counts::FilterName::annotation] << endl + << "Incorrectly Mapped Filter: " << counts.counts[Counts::FilterName::incorrectly_mapped] << endl << endl; return os; } @@ -45,4 +47,5 @@ bool ReadFilter::is_mapped(const MultipathAlignment& mp_alig return false; } + } diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 454983a4644..7c0fab58daa 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -90,6 +90,10 @@ class ReadFilter{ /// Sometimes we only want a report, and not a filtered gam. toggling off output /// speeds things up considerably. bool write_output = true; + /// Sometimes we want to pick out fields and write a tsv instead of a gam; + bool write_tsv = false; + /// What fields do we want to write to the tsv? + vector output_fields; /// A HandleGraph is required for some filters (Note: ReadFilter doesn't own/free this) const HandleGraph* graph = nullptr; /// Interleaved input @@ -102,6 +106,14 @@ class ReadFilter{ int min_base_quality = numeric_limits::min() / 2; // minimum fraction of bases in reads that must have quality at least double min_base_quality_fraction = numeric_limits::lowest(); + + /// A string formatted "annotation[.subfield]*:value" + /// Value is optional if the key is a flag + /// Used like jq select + string annotation_to_match = ""; + + /// Filter to only correctly mapped reads + bool only_correctly_mapped = false; /** * Run all the filters on an alignment. The alignment may get modified in-place by the defray filter @@ -243,7 +255,17 @@ class ReadFilter{ * Does the read contain at least one of the indicated sequences */ bool contains_subsequence(const Read& read) const; + + /** + * Does has the given annotation and does it match + */ + bool matches_annotation(const Read& read) const; + /** + * Check if the alignment is marked as being correctly mapped + */ + bool is_correctly_mapped(const Read& read) const; + /** * Write the read to stdout */ @@ -253,6 +275,12 @@ class ReadFilter{ * Write a read pair to stdout */ void emit(Read& read1, Read& read2); + + /** + * Write a tsv line for a read to stdout + */ + void emit_tsv(Read& read); + /// The twp specializations have different writing infrastructure @@ -268,7 +296,7 @@ struct Counts { // note: "last" must be kept as the final value in this enum enum FilterName { read = 0, wrong_name, wrong_refpos, excluded_feature, min_score, min_sec_score, max_overhang, min_end_matches, min_mapq, split, repeat, defray, defray_all, random, min_base_qual, subsequence, filtered, - proper_pair, unmapped, last}; + proper_pair, unmapped, annotation, incorrectly_mapped, last}; vector counts; Counts () : counts(FilterName::last, 0) {} Counts& operator+=(const Counts& other) { @@ -326,8 +354,12 @@ void ReadFilter::filter_internal(istream* in) { #endif Counts read_counts = filter_alignment(read); counts_vec[omp_get_thread_num()] += read_counts; - if ((read_counts.keep() != complement_filter) && write_output) { - emit(read); + if ((read_counts.keep() != complement_filter) && (write_output || write_tsv)) { + if (write_tsv) { + emit_tsv(read); + } else { + emit(read); + } } }; @@ -343,10 +375,27 @@ void ReadFilter::filter_internal(istream* in) { read_counts.set_paired_any(); } counts_vec[omp_get_thread_num()] += read_counts; - if ((read_counts.keep() != complement_filter) && write_output) { - emit(read1, read2); + if ((read_counts.keep() != complement_filter) && (write_output || write_tsv)) { + if (write_tsv) { + emit_tsv(read1); + emit_tsv(read2); + } else { + emit(read1, read2); + } } }; + + if (write_tsv) { + //write a header for the tsv + cout << "#"; + for (size_t i = 0 ; i < output_fields.size() ; i++) { + const string& field = output_fields[i]; + cout << field; + if (i != output_fields.size()-1) { + cout << "\t"; + } + } + } if (interleaved) { vg::io::for_each_interleaved_pair_parallel(*in, pair_lambda); @@ -402,6 +451,18 @@ inline int ReadFilter::filter(istream* alignment_stream) { return 0; } +template<> +inline bool ReadFilter::is_correctly_mapped(const Alignment& alignment) const { + return alignment.correctly_mapped(); +} + +//Looks like multipath alignments don't have a field for this +template<> +inline bool ReadFilter::is_correctly_mapped(const MultipathAlignment& alignment) const { + throw(std::runtime_error("error: multipath alignments don't have a field correctly_mapped")); + return true; +} + template<> inline void ReadFilter::emit(Alignment& aln) { @@ -536,6 +597,19 @@ Counts ReadFilter::filter_alignment(Read& read) { keep = false; } } + if ((keep || verbose) && !annotation_to_match.empty()) { + if (!matches_annotation(read)) { + ++counts.counts[Counts::FilterName::annotation]; + keep = false; + } + } + + if ((keep || verbose) && only_correctly_mapped) { + if (!is_correctly_mapped(read)) { + ++counts.counts[Counts::FilterName::incorrectly_mapped]; + keep = false; + } + } if (!keep) { ++counts.counts[Counts::FilterName::filtered]; @@ -1286,6 +1360,107 @@ bool ReadFilter::contains_subsequence(const Read& read) const { return found; } +template +bool ReadFilter::matches_annotation(const Read& read) const { + + //Assume that there's only one level of annotations + size_t colon_pos = annotation_to_match.find(":"); + if (colon_pos == string::npos) { + //If there was no colon, then just check for the existence of the annotation + // or, if it is a boolean value, check that it's true + if (!has_annotation(read, annotation_to_match)) { + return false; + } + google::protobuf::Value value = read.annotation().fields().at(annotation_to_match); + if (value.kind_case() == google::protobuf::Value::KindCase::kBoolValue) { + return get_annotation(read, annotation_to_match); + } else { + return true; + } + } else { + string annotation_key = annotation_to_match.substr(0, colon_pos); + if (!has_annotation(read, annotation_key)) { + return false; + } else { + google::protobuf::Value value = read.annotation().fields().at(annotation_key); + string annotation_val = annotation_to_match.substr(colon_pos+1, annotation_to_match.size() - colon_pos - 1); + if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue) { + double read_value = get_annotation(read, annotation_key); + return read_value == std::stod(annotation_val); + } else if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) { + return get_annotation(read, annotation_key) == annotation_val; + } else { + throw runtime_error("error: Cannot check equality of annotation " + annotation_to_match); + } + } + } + +} + +template<> +inline void ReadFilter::emit_tsv(MultipathAlignment& read) { + return; +} +template<> +inline void ReadFilter::emit_tsv(Alignment& read) { +#pragma omp critical (cout) + { + + cout << endl; + for (size_t i = 0 ; i < output_fields.size() ; i++) { + const string& field = output_fields[i]; + if (field == "name") { + cout << read.name(); + } else if (field == "correctly_mapped") { + if (is_correctly_mapped(read)) { + cout << "True"; + } else { + cout << "False"; + } + } else if (field == "correctness") { + if (is_correctly_mapped(read)) { + cout << "correct"; + } else if (has_annotation(read, "no_truth") && get_annotation(read, "no_truth")) { + cout << "off-reference"; + } else { + cout << "incorrect"; + } + } else if (field == "mapping_quality") { + cout << get_mapq(read); + } else if (field == "sequence") { + cout << read.sequence(); + } else if (field == "time_used") { + cout << read.time_used(); + } else if (field == "annotation") { + throw runtime_error("error: Cannot write all annotations"); + } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") { + if (!has_annotation(read, field.substr(11, field.size()-11))) { + throw runtime_error("error: Cannot find annotation "+ field); + } else { + string annotation_key = field.substr(11, field.size()-11); + google::protobuf::Value value = read.annotation().fields().at(annotation_key); + + if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue) { + cout << get_annotation(read, annotation_key); + } else if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) { + cout << get_annotation(read, annotation_key); + } else { + cout << "?"; + } + } + } else { + cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl; + throw runtime_error("error: Writing non-existent field to tsv: " + field); + } + if (i != output_fields.size()-1) { + cout << "\t"; + } + } + + } +} + + template bool ReadFilter::sample_read(const Read& read) const { // Decide if the alignment is paired. diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index 7a233e6986e..af1c5678cbb 100644 --- a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -48,6 +48,7 @@ void help_filter(char** argv) { << " -x, --xg-name FILE use this xg index or graph (required for -S and -D)" << endl << " -v, --verbose print out statistics on numbers of reads filtered by what." << endl << " -V, --no-output print out statistics (as above) but do not write out filtered GAM." << endl + << " -T, --tsv-out FIELD[;FIELD] do not write filtered gam but a tsv of the given fields" << endl << " -q, --min-mapq N filter alignments with mapping quality < N" << endl << " -E, --repeat-ends N filter reads with tandem repeat (motif size <= 2N, spanning >= N bases) at either end" << endl << " -D, --defray-ends N clip back the ends of reads that are ambiguously aligned, up to N bases" << endl @@ -56,6 +57,9 @@ void help_filter(char** argv) { << " -i, --interleaved assume interleaved input. both ends will be filtered out if either fails filter" << endl << " -I, --interleaved-all assume interleaved input. both ends will be filtered out if *both* fail filters" << endl << " -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q." << endl + << " -B, --annotation K[:V] keep reads if the annotation is present. If a value is given, keep reads if the values are equal" << endl + << " similar to running jq 'select(.annotation.K==V)' on the json" << endl + << " -c, --correctly-mapped keep only reads that are marked as correctly-mapped" << endl << " -U, --complement apply the complement of the filter implied by the other arguments." << endl << " -t, --threads N number of threads [1]" << endl; } @@ -105,6 +109,9 @@ int main_filter(int argc, char** argv) { bool complement_filter = false; bool only_proper_pairs = false; bool only_mapped = false; + string annotation = ""; + string output_fields = ""; + bool correctly_mapped = false; // What XG index, if any, should we load to support the other options? string xg_name; @@ -133,6 +140,7 @@ int main_filter(int argc, char** argv) { {"drop-split", no_argument, 0, 'S'}, {"xg-name", required_argument, 0, 'x'}, {"verbose", no_argument, 0, 'v'}, + {"tsv-out", no_argument, 0, 'T'}, {"min-mapq", required_argument, 0, 'q'}, {"repeat-ends", required_argument, 0, 'E'}, {"defray-ends", required_argument, 0, 'D'}, @@ -141,13 +149,15 @@ int main_filter(int argc, char** argv) { {"interleaved", no_argument, 0, 'i'}, {"interleaved-all", no_argument, 0, 'I'}, {"min-base-quality", required_argument, 0, 'b'}, + {"annotation", required_argument, 0, 'B'}, + {"correctly-mapped", no_argument, 0, 'c'}, {"complement", no_argument, 0, 'U'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:Ut:", + c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:cUt:", long_options, &option_index); /* Detect the end of the options. */ @@ -244,6 +254,9 @@ int main_filter(int argc, char** argv) { verbose = true; write_output = false; break; + case 'T': + output_fields=optarg; + break; case 'E': set_repeat_size = true; repeat_size = parse(optarg); @@ -308,6 +321,12 @@ int main_filter(int argc, char** argv) { } } break; + case 'B': + annotation = optarg; + break; + case 'c': + correctly_mapped = true; + break; case 'U': complement_filter = true; break; @@ -375,6 +394,20 @@ int main_filter(int argc, char** argv) { } filter.verbose = verbose; filter.write_output = write_output; + + + if (!output_fields.empty()){ + //Get the fields for tsv output + filter.write_tsv = true; + filter.write_output = false; + size_t start_i = 0; + for (size_t end_i = 0 ; end_i <= output_fields.size() ; end_i++) { + if (end_i == output_fields.size() || output_fields[end_i] == ';') { + filter.output_fields.emplace_back(output_fields.substr(start_i, end_i-start_i)); + start_i = end_i + 1; + } + } + } if (set_repeat_size) { filter.repeat_size = repeat_size; } @@ -403,6 +436,8 @@ int main_filter(int argc, char** argv) { filter.min_base_quality = min_base_quality; filter.min_base_quality_fraction = min_base_quality_fraction; } + filter.annotation_to_match = annotation; + filter.only_correctly_mapped = correctly_mapped; filter.complement_filter = complement_filter; filter.threads = get_thread_count(); filter.graph = xindex;