From cc2be0a9af300bbd10c5a30968b0dd6d80c08b02 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 11 Jan 2024 19:27:09 +0100 Subject: [PATCH 01/14] Add a filter for annotation matching --- src/readfilter.hpp | 70 +++++++++++++++++++++++++++++++++- src/subcommand/filter_main.cpp | 9 ++++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 454983a4644..2f2d2510498 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -102,6 +102,11 @@ class ReadFilter{ int min_base_quality = numeric_limits::min() / 2; // minimum fraction of bases in reads that must have quality at least double min_base_quality_fraction = numeric_limits::lowest(); + + /// A string formatted "annotation[.subfield]*:value" + /// Value is optional if the key is a flag + /// Used like jq select + string annotation = ""; /** * Run all the filters on an alignment. The alignment may get modified in-place by the defray filter @@ -243,6 +248,11 @@ class ReadFilter{ * Does the read contain at least one of the indicated sequences */ bool contains_subsequence(const Read& read) const; + + /** + * Does has the given annotation and does it match + */ + bool matches_annotation(const Read& read) const; /** * Write the read to stdout @@ -253,6 +263,7 @@ class ReadFilter{ * Write a read pair to stdout */ void emit(Read& read1, Read& read2); + /// The twp specializations have different writing infrastructure @@ -268,7 +279,7 @@ struct Counts { // note: "last" must be kept as the final value in this enum enum FilterName { read = 0, wrong_name, wrong_refpos, excluded_feature, min_score, min_sec_score, max_overhang, min_end_matches, min_mapq, split, repeat, defray, defray_all, random, min_base_qual, subsequence, filtered, - proper_pair, unmapped, last}; + proper_pair, unmapped, annotation, last}; vector counts; Counts () : counts(FilterName::last, 0) {} Counts& operator+=(const Counts& other) { @@ -536,6 +547,12 @@ Counts ReadFilter::filter_alignment(Read& read) { keep = false; } } + if ((keep || verbose) && !annotation.empty()) { + if (!matches_annotation(read)) { + ++counts.counts[Counts::FilterName::annotation]; + keep = false; + } + } if (!keep) { ++counts.counts[Counts::FilterName::filtered]; @@ -1286,6 +1303,57 @@ bool ReadFilter::contains_subsequence(const Read& read) const { return found; } +template +bool ReadFilter::matches_annotation(const Read& read) const { + + //Walk through the annotation until we find a . or : + size_t start_i = 0; + + //This gets updated as needed + google::protobuf::Struct read_annotation = read.annotation(); + for (size_t end_i = 0 ; end_i < annotation.size() ; end_i++) { + if (annotation[end_i] == '.' || annotation[end_i] == ':') { + if (!read_annotation.fields().count(annotation.substr(start_i, end_i-start_i))) { + //If the annotation doesn't exist + return false; + } else if ( end_i == annotation.size()-1) { + //If the annotation does exist and we only want to check if the read has it + return true; + } else if (annotation[end_i] == ':') { + //If this is the last annotation key and now we want to check the value + google::protobuf::Value value= read_annotation.fields().at(annotation.substr(start_i, end_i-start_i)); + + if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue && + regex_match(annotation.substr(end_i+1, annotation.size() - end_i - 1), + regex("([0-9]*\\.)?[0-9]*"))) { + //If the value is supposed to be a number + } else if (value_cast(value) == annotation.substr(end_i+1, annotation.size() - end_i - 1)) { + //Otherwise assume that the value is a string + return true; + } else { + return false; + } + } else { + //Otherwise, this is the end of one annotation but it has a subfield next + google::protobuf::Value value= read_annotation.fields().at(annotation.substr(start_i, end_i-start_i)); + if (value.kind_case() != google::protobuf::Value::KindCase::kStructValue) { + //If this isn't a google::protobuf::Struct, then the input was bad + cerr << "warning: expected another annotation field after " + << annotation.substr(start_i, end_i-start_i) + << " but annotation ends with value " + << value_cast(read_annotation.fields().at(annotation.substr(start_i, end_i-start_i))) << endl; + return false; + } else { + read_annotation = read_annotation.fields().at(annotation.substr(start_i, end_i-start_i)).struct_value(); + } + } + } + } + //If we got through then we must have found the annotations + return true; + +} + template bool ReadFilter::sample_read(const Read& read) const { // Decide if the alignment is paired. diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index 7a233e6986e..0b625ca4940 100644 --- a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -56,6 +56,7 @@ void help_filter(char** argv) { << " -i, --interleaved assume interleaved input. both ends will be filtered out if either fails filter" << endl << " -I, --interleaved-all assume interleaved input. both ends will be filtered out if *both* fail filters" << endl << " -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q." << endl + << " -B, --annotation K[:V] keep reads if the annotation is present. If a value is given, keep reads if the value equal" << endl << " -U, --complement apply the complement of the filter implied by the other arguments." << endl << " -t, --threads N number of threads [1]" << endl; } @@ -105,6 +106,7 @@ int main_filter(int argc, char** argv) { bool complement_filter = false; bool only_proper_pairs = false; bool only_mapped = false; + string annotation = ""; // What XG index, if any, should we load to support the other options? string xg_name; @@ -141,13 +143,14 @@ int main_filter(int argc, char** argv) { {"interleaved", no_argument, 0, 'i'}, {"interleaved-all", no_argument, 0, 'I'}, {"min-base-quality", required_argument, 0, 'b'}, + {"annotation", required_argument, 0, 'B'}, {"complement", no_argument, 0, 'U'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:Ut:", + c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:B:Ut:", long_options, &option_index); /* Detect the end of the options. */ @@ -308,6 +311,9 @@ int main_filter(int argc, char** argv) { } } break; + case 'B': + annotation = optarg; + break; case 'U': complement_filter = true; break; @@ -403,6 +409,7 @@ int main_filter(int argc, char** argv) { filter.min_base_quality = min_base_quality; filter.min_base_quality_fraction = min_base_quality_fraction; } + filter.annotation = annotation; filter.complement_filter = complement_filter; filter.threads = get_thread_count(); filter.graph = xindex; From 82260ce866856e4b9872201349638230ebcb0388 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 11 Jan 2024 19:48:02 +0100 Subject: [PATCH 02/14] Add a filter for correctly mapped reads --- src/readfilter.cpp | 14 +++++++++++ src/readfilter.hpp | 46 +++++++++++++++++++++++----------- src/subcommand/filter_main.cpp | 11 ++++++-- 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/src/readfilter.cpp b/src/readfilter.cpp index 34d6dffc842..e59d35f94ab 100644 --- a/src/readfilter.cpp +++ b/src/readfilter.cpp @@ -24,6 +24,8 @@ ostream& operator<<(ostream& os, const Counts& counts) { << "Min Quality Filter: " << counts.counts[Counts::FilterName::min_mapq] << endl << "Min Base Quality Filter: " << counts.counts[Counts::FilterName::min_base_qual] << endl << "Random Filter: " << counts.counts[Counts::FilterName::random] << endl + << "Annotation Filter: " << counts.counts[Counts::FilterName::annotation] << endl + << "Incorrectly Mapped Filter: " << counts.counts[Counts::FilterName::incorrectly_mapped] << endl << endl; return os; } @@ -45,4 +47,16 @@ bool ReadFilter::is_mapped(const MultipathAlignment& mp_alig return false; } +template<> +bool ReadFilter::is_correctly_mapped(const Alignment& alignment) const { + return alignment.correctly_mapped(); +} + +//Looks like multipath alignments don't have a field for this +template<> +bool ReadFilter::is_correctly_mapped(const MultipathAlignment& alignment) const { + throw(std::runtime_error("error: multipath alignments don't have a field correctly_mapped")); + return true; +} + } diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 2f2d2510498..882d27c222b 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -106,7 +106,10 @@ class ReadFilter{ /// A string formatted "annotation[.subfield]*:value" /// Value is optional if the key is a flag /// Used like jq select - string annotation = ""; + string annotation_to_match = ""; + + /// Filter to only correctly mapped reads + bool only_correctly_mapped = false; /** * Run all the filters on an alignment. The alignment may get modified in-place by the defray filter @@ -254,6 +257,11 @@ class ReadFilter{ */ bool matches_annotation(const Read& read) const; + /** + * Check if the alignment is marked as being correctly mapped + */ + bool is_correctly_mapped(const Read& read) const; + /** * Write the read to stdout */ @@ -279,7 +287,7 @@ struct Counts { // note: "last" must be kept as the final value in this enum enum FilterName { read = 0, wrong_name, wrong_refpos, excluded_feature, min_score, min_sec_score, max_overhang, min_end_matches, min_mapq, split, repeat, defray, defray_all, random, min_base_qual, subsequence, filtered, - proper_pair, unmapped, annotation, last}; + proper_pair, unmapped, annotation, incorrectly_mapped, last}; vector counts; Counts () : counts(FilterName::last, 0) {} Counts& operator+=(const Counts& other) { @@ -547,12 +555,19 @@ Counts ReadFilter::filter_alignment(Read& read) { keep = false; } } - if ((keep || verbose) && !annotation.empty()) { + if ((keep || verbose) && !annotation_to_match.empty()) { if (!matches_annotation(read)) { ++counts.counts[Counts::FilterName::annotation]; keep = false; } } + + if ((keep || verbose) && only_correctly_mapped) { + if (!is_correctly_mapped(read)) { + ++counts.counts[Counts::FilterName::incorrectly_mapped]; + keep = false; + } + } if (!keep) { ++counts.counts[Counts::FilterName::filtered]; @@ -1311,23 +1326,23 @@ bool ReadFilter::matches_annotation(const Read& read) const { //This gets updated as needed google::protobuf::Struct read_annotation = read.annotation(); - for (size_t end_i = 0 ; end_i < annotation.size() ; end_i++) { - if (annotation[end_i] == '.' || annotation[end_i] == ':') { - if (!read_annotation.fields().count(annotation.substr(start_i, end_i-start_i))) { + for (size_t end_i = 0 ; end_i < annotation_to_match.size() ; end_i++) { + if (annotation_to_match[end_i] == '.' || annotation_to_match[end_i] == ':') { + if (!read_annotation.fields().count(annotation_to_match.substr(start_i, end_i-start_i))) { //If the annotation doesn't exist return false; - } else if ( end_i == annotation.size()-1) { + } else if ( end_i == annotation_to_match.size()-1) { //If the annotation does exist and we only want to check if the read has it return true; - } else if (annotation[end_i] == ':') { + } else if (annotation_to_match[end_i] == ':') { //If this is the last annotation key and now we want to check the value - google::protobuf::Value value= read_annotation.fields().at(annotation.substr(start_i, end_i-start_i)); + google::protobuf::Value value= read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i)); if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue && - regex_match(annotation.substr(end_i+1, annotation.size() - end_i - 1), + regex_match(annotation_to_match.substr(end_i+1, annotation_to_match.size() - end_i - 1), regex("([0-9]*\\.)?[0-9]*"))) { //If the value is supposed to be a number - } else if (value_cast(value) == annotation.substr(end_i+1, annotation.size() - end_i - 1)) { + } else if (value_cast(value) == annotation_to_match.substr(end_i+1, annotation_to_match.size() - end_i - 1)) { //Otherwise assume that the value is a string return true; } else { @@ -1335,16 +1350,16 @@ bool ReadFilter::matches_annotation(const Read& read) const { } } else { //Otherwise, this is the end of one annotation but it has a subfield next - google::protobuf::Value value= read_annotation.fields().at(annotation.substr(start_i, end_i-start_i)); + google::protobuf::Value value= read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i)); if (value.kind_case() != google::protobuf::Value::KindCase::kStructValue) { //If this isn't a google::protobuf::Struct, then the input was bad cerr << "warning: expected another annotation field after " - << annotation.substr(start_i, end_i-start_i) + << annotation_to_match.substr(start_i, end_i-start_i) << " but annotation ends with value " - << value_cast(read_annotation.fields().at(annotation.substr(start_i, end_i-start_i))) << endl; + << value_cast(read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i))) << endl; return false; } else { - read_annotation = read_annotation.fields().at(annotation.substr(start_i, end_i-start_i)).struct_value(); + read_annotation = read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i)).struct_value(); } } } @@ -1354,6 +1369,7 @@ bool ReadFilter::matches_annotation(const Read& read) const { } + template bool ReadFilter::sample_read(const Read& read) const { // Decide if the alignment is paired. diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index 0b625ca4940..b22c647c940 100644 --- a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -57,6 +57,7 @@ void help_filter(char** argv) { << " -I, --interleaved-all assume interleaved input. both ends will be filtered out if *both* fail filters" << endl << " -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q." << endl << " -B, --annotation K[:V] keep reads if the annotation is present. If a value is given, keep reads if the value equal" << endl + << " -c, --correctly-mapped keep only reads that are marked as correctly-mapped" << endl << " -U, --complement apply the complement of the filter implied by the other arguments." << endl << " -t, --threads N number of threads [1]" << endl; } @@ -107,6 +108,7 @@ int main_filter(int argc, char** argv) { bool only_proper_pairs = false; bool only_mapped = false; string annotation = ""; + bool correctly_mapped = false; // What XG index, if any, should we load to support the other options? string xg_name; @@ -144,13 +146,14 @@ int main_filter(int argc, char** argv) { {"interleaved-all", no_argument, 0, 'I'}, {"min-base-quality", required_argument, 0, 'b'}, {"annotation", required_argument, 0, 'B'}, + {"correctly-mapped", no_argument, 0, 'c'}, {"complement", no_argument, 0, 'U'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:B:Ut:", + c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:B:cUt:", long_options, &option_index); /* Detect the end of the options. */ @@ -314,6 +317,9 @@ int main_filter(int argc, char** argv) { case 'B': annotation = optarg; break; + case 'c': + correctly_mapped = true; + break; case 'U': complement_filter = true; break; @@ -409,7 +415,8 @@ int main_filter(int argc, char** argv) { filter.min_base_quality = min_base_quality; filter.min_base_quality_fraction = min_base_quality_fraction; } - filter.annotation = annotation; + filter.annotation_to_match = annotation; + filter.only_correctly_mapped = true; filter.complement_filter = complement_filter; filter.threads = get_thread_count(); filter.graph = xindex; From 146f1895d023e42a8114e04b1ae34c1c10b07ce0 Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 12 Jan 2024 14:36:02 +0100 Subject: [PATCH 03/14] Simplify annotation matching and add tsv output to vg filter --- src/readfilter.hpp | 117 +++++++++++++++++++++------------ src/subcommand/filter_main.cpp | 24 ++++++- 2 files changed, 96 insertions(+), 45 deletions(-) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 882d27c222b..d567a3698e2 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -90,6 +90,10 @@ class ReadFilter{ /// Sometimes we only want a report, and not a filtered gam. toggling off output /// speeds things up considerably. bool write_output = true; + /// Sometimes we want to pick out fields and write a tsv instead of a gam; + bool write_tsv = false; + /// What fields do we want to write to the tsv? + vector output_fields; /// A HandleGraph is required for some filters (Note: ReadFilter doesn't own/free this) const HandleGraph* graph = nullptr; /// Interleaved input @@ -272,6 +276,11 @@ class ReadFilter{ */ void emit(Read& read1, Read& read2); + /** + * Write a tsv line for a read to stdout + */ + void emit_tsv(Read& read); + /// The twp specializations have different writing infrastructure @@ -346,7 +355,11 @@ void ReadFilter::filter_internal(istream* in) { Counts read_counts = filter_alignment(read); counts_vec[omp_get_thread_num()] += read_counts; if ((read_counts.keep() != complement_filter) && write_output) { - emit(read); + if (write_tsv) { + emit_tsv(read); + } else { + emit(read); + } } }; @@ -363,9 +376,25 @@ void ReadFilter::filter_internal(istream* in) { } counts_vec[omp_get_thread_num()] += read_counts; if ((read_counts.keep() != complement_filter) && write_output) { - emit(read1, read2); + if (write_tsv) { + emit_tsv(read1); + emit_tsv(read2); + } else { + emit(read1, read2); + } } }; + + if (write_tsv) { + //write a header for the tsv + for (size_t i = 0 ; i < output_fields.size() i++) { + const string& field = output_fields[i]; + cout << field; + if (i == output_fields.size()-1) { + cout << "\t"; + } + } + } if (interleaved) { vg::io::for_each_interleaved_pair_parallel(*in, pair_lambda); @@ -1321,52 +1350,54 @@ bool ReadFilter::contains_subsequence(const Read& read) const { template bool ReadFilter::matches_annotation(const Read& read) const { - //Walk through the annotation until we find a . or : - size_t start_i = 0; - - //This gets updated as needed - google::protobuf::Struct read_annotation = read.annotation(); - for (size_t end_i = 0 ; end_i < annotation_to_match.size() ; end_i++) { - if (annotation_to_match[end_i] == '.' || annotation_to_match[end_i] == ':') { - if (!read_annotation.fields().count(annotation_to_match.substr(start_i, end_i-start_i))) { - //If the annotation doesn't exist - return false; - } else if ( end_i == annotation_to_match.size()-1) { - //If the annotation does exist and we only want to check if the read has it - return true; - } else if (annotation_to_match[end_i] == ':') { - //If this is the last annotation key and now we want to check the value - google::protobuf::Value value= read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i)); + //Assume that there's only one level of annotations + size_t colon_pos = annotation_to_match.find(":"); + if (colon_pos == string::npos) { + //If there was no colon, then just check for the existence of the annotation + return has_annotation(read, annotation_to_match); + } else { + string annotation_key = annotation_to_match.substr(0, colon_pos); + if (!has_annotation(read, annotation_key)) { + return false; + } else { + string annotation_val = annotation_to_match.substr(colon_pos+1, annotation_to_match.size() - colon_pos - 1); + return get_annotation(read, annotation_key) == annotation_val; + } + } + +} - if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue && - regex_match(annotation_to_match.substr(end_i+1, annotation_to_match.size() - end_i - 1), - regex("([0-9]*\\.)?[0-9]*"))) { - //If the value is supposed to be a number - } else if (value_cast(value) == annotation_to_match.substr(end_i+1, annotation_to_match.size() - end_i - 1)) { - //Otherwise assume that the value is a string - return true; - } else { - return false; - } +template +void ReadFilter::emit_tsv(Read& read) { + cout << endl; + for (size_t i = 0 ; i < output_fields.size() ; i++) { + const string& field = output_fields[i]; + if (field == "name") { + cout << read.name(); + } else if (field == "correctly_mapped") { + if (is_correctly_mapped(read)) { + cout << "True"; } else { - //Otherwise, this is the end of one annotation but it has a subfield next - google::protobuf::Value value= read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i)); - if (value.kind_case() != google::protobuf::Value::KindCase::kStructValue) { - //If this isn't a google::protobuf::Struct, then the input was bad - cerr << "warning: expected another annotation field after " - << annotation_to_match.substr(start_i, end_i-start_i) - << " but annotation ends with value " - << value_cast(read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i))) << endl; - return false; - } else { - read_annotation = read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i)).struct_value(); - } + cout << "False"; + } + } else if (field == "mapping_quality") { + cout << get_mapq(read); + } else if (field == "annotation") { + throw runtime_error("error: Cannot write all annotations"); + } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") { + if (!has_annotation(read, field.substr(11, field.size()-11))) { + throw runtime_error("error: Cannot find annotation "+ field); + } else { + cout << get_annotation(read, field.substr(11, field.size()-11)); } + } else { + cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl; + throw runtime_error("error: Writing non-existent field to tsv: " + field); + } + if (i != output_fields.size()-1) { + cout << "\t"; } } - //If we got through then we must have found the annotations - return true; - } diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index b22c647c940..dae80d2c8e1 100644 --- a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -48,6 +48,7 @@ void help_filter(char** argv) { << " -x, --xg-name FILE use this xg index or graph (required for -S and -D)" << endl << " -v, --verbose print out statistics on numbers of reads filtered by what." << endl << " -V, --no-output print out statistics (as above) but do not write out filtered GAM." << endl + << " -T, --tsv-out FIELD[;FIELD] do not write filtered gam but a tsv of the given fields" << endl << " -q, --min-mapq N filter alignments with mapping quality < N" << endl << " -E, --repeat-ends N filter reads with tandem repeat (motif size <= 2N, spanning >= N bases) at either end" << endl << " -D, --defray-ends N clip back the ends of reads that are ambiguously aligned, up to N bases" << endl @@ -56,7 +57,8 @@ void help_filter(char** argv) { << " -i, --interleaved assume interleaved input. both ends will be filtered out if either fails filter" << endl << " -I, --interleaved-all assume interleaved input. both ends will be filtered out if *both* fail filters" << endl << " -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q." << endl - << " -B, --annotation K[:V] keep reads if the annotation is present. If a value is given, keep reads if the value equal" << endl + << " -B, --annotation K[:V] keep reads if the annotation is present. If a value is given, keep reads if the values are equal" << endl + << " similar to running jq 'select(.annotation.K==V)' on the json" << endl << " -c, --correctly-mapped keep only reads that are marked as correctly-mapped" << endl << " -U, --complement apply the complement of the filter implied by the other arguments." << endl << " -t, --threads N number of threads [1]" << endl; @@ -108,6 +110,7 @@ int main_filter(int argc, char** argv) { bool only_proper_pairs = false; bool only_mapped = false; string annotation = ""; + string output_fields = ""; bool correctly_mapped = false; // What XG index, if any, should we load to support the other options? @@ -137,6 +140,7 @@ int main_filter(int argc, char** argv) { {"drop-split", no_argument, 0, 'S'}, {"xg-name", required_argument, 0, 'x'}, {"verbose", no_argument, 0, 'v'}, + {"tsv-out", no_argument, 0, 'T'}, {"min-mapq", required_argument, 0, 'q'}, {"repeat-ends", required_argument, 0, 'E'}, {"defray-ends", required_argument, 0, 'D'}, @@ -153,7 +157,7 @@ int main_filter(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:B:cUt:", + c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:cUt:", long_options, &option_index); /* Detect the end of the options. */ @@ -250,6 +254,9 @@ int main_filter(int argc, char** argv) { verbose = true; write_output = false; break; + case 'T': + output_fields=optarg; + break; case 'E': set_repeat_size = true; repeat_size = parse(optarg); @@ -387,6 +394,19 @@ int main_filter(int argc, char** argv) { } filter.verbose = verbose; filter.write_output = write_output; + + + if (!output_fields.empty()){ + //Get the fields for tsv output + filter.write_tsv = true; + size_t start_i = 0; + for (size_t end_i = 0 ; end_i <= output_fields.size() ; end_i++) { + if (end_i == output_fields.size() || output_fields[end_i] == ';') { + filter.output_fields.emplace_back(output_fields.substr(start_i, end_i-start_i)); + start_i = end_i + 1; + } + } + } if (set_repeat_size) { filter.repeat_size = repeat_size; } From e84b0b5d24f6f29973e5d530c3053cdefa8ddbfb Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 12 Jan 2024 15:00:41 +0100 Subject: [PATCH 04/14] Fix typo --- src/readfilter.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index d567a3698e2..7b72da5c708 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -387,7 +387,7 @@ void ReadFilter::filter_internal(istream* in) { if (write_tsv) { //write a header for the tsv - for (size_t i = 0 ; i < output_fields.size() i++) { + for (size_t i = 0 ; i < output_fields.size() ; i++) { const string& field = output_fields[i]; cout << field; if (i == output_fields.size()-1) { From fc0f90fabaabf866ec5ee984503f5fb39322f9a2 Mon Sep 17 00:00:00 2001 From: Xian Date: Mon, 22 Jan 2024 17:03:43 +0100 Subject: [PATCH 05/14] Write tsv header properly --- src/readfilter.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 7b72da5c708..5309af0ed4e 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -390,7 +390,7 @@ void ReadFilter::filter_internal(istream* in) { for (size_t i = 0 ; i < output_fields.size() ; i++) { const string& field = output_fields[i]; cout << field; - if (i == output_fields.size()-1) { + if (i != output_fields.size()-1) { cout << "\t"; } } From a8d1eaf50aa54fd1994856beebd43a0d0e3877f3 Mon Sep 17 00:00:00 2001 From: Xian Date: Wed, 24 Jan 2024 14:33:28 +0100 Subject: [PATCH 06/14] Try to make tsv output parallel --- src/readfilter.hpp | 53 +++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 5309af0ed4e..5152750e2bd 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -1369,34 +1369,39 @@ bool ReadFilter::matches_annotation(const Read& read) const { template void ReadFilter::emit_tsv(Read& read) { - cout << endl; - for (size_t i = 0 ; i < output_fields.size() ; i++) { - const string& field = output_fields[i]; - if (field == "name") { - cout << read.name(); - } else if (field == "correctly_mapped") { - if (is_correctly_mapped(read)) { - cout << "True"; +#pragma omp critical (cout) + { + + cout << endl; + for (size_t i = 0 ; i < output_fields.size() ; i++) { + const string& field = output_fields[i]; + if (field == "name") { + cout << read.name(); + } else if (field == "correctly_mapped") { + if (is_correctly_mapped(read)) { + cout << "True"; + } else { + cout << "False"; + } + } else if (field == "mapping_quality") { + cout << get_mapq(read); + } else if (field == "annotation") { + throw runtime_error("error: Cannot write all annotations"); + } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") { + if (!has_annotation(read, field.substr(11, field.size()-11))) { + throw runtime_error("error: Cannot find annotation "+ field); + } else { + cout << get_annotation(read, field.substr(11, field.size()-11)); + } } else { - cout << "False"; + cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl; + throw runtime_error("error: Writing non-existent field to tsv: " + field); } - } else if (field == "mapping_quality") { - cout << get_mapq(read); - } else if (field == "annotation") { - throw runtime_error("error: Cannot write all annotations"); - } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") { - if (!has_annotation(read, field.substr(11, field.size()-11))) { - throw runtime_error("error: Cannot find annotation "+ field); - } else { - cout << get_annotation(read, field.substr(11, field.size()-11)); + if (i != output_fields.size()-1) { + cout << "\t"; } - } else { - cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl; - throw runtime_error("error: Writing non-existent field to tsv: " + field); - } - if (i != output_fields.size()-1) { - cout << "\t"; } + } } From 4b74f91fb85f87ddd7148452212b0b7cecfc8fdf Mon Sep 17 00:00:00 2001 From: Xian Date: Wed, 24 Jan 2024 15:07:18 +0100 Subject: [PATCH 07/14] Turn off correctly mapped filter --- src/subcommand/filter_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index dae80d2c8e1..b54642fabfc 100644 --- a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -436,7 +436,7 @@ int main_filter(int argc, char** argv) { filter.min_base_quality_fraction = min_base_quality_fraction; } filter.annotation_to_match = annotation; - filter.only_correctly_mapped = true; + filter.only_correctly_mapped = correctly_mapped; filter.complement_filter = complement_filter; filter.threads = get_thread_count(); filter.graph = xindex; From 7a8c38cd5f46fc0c0ae410e2facc3204cab1ace5 Mon Sep 17 00:00:00 2001 From: Xian Date: Wed, 24 Jan 2024 17:54:07 +0100 Subject: [PATCH 08/14] Don't make a alignment emitter when writing tsv --- src/subcommand/filter_main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index b54642fabfc..af1c5678cbb 100644 --- a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -399,6 +399,7 @@ int main_filter(int argc, char** argv) { if (!output_fields.empty()){ //Get the fields for tsv output filter.write_tsv = true; + filter.write_output = false; size_t start_i = 0; for (size_t end_i = 0 ; end_i <= output_fields.size() ; end_i++) { if (end_i == output_fields.size() || output_fields[end_i] == ';') { From 837fa357c1847878413c760d55cca188152b5f42 Mon Sep 17 00:00:00 2001 From: Xian Date: Wed, 24 Jan 2024 18:00:55 +0100 Subject: [PATCH 09/14] Write tsv output --- src/readfilter.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 5152750e2bd..bc366387e3e 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -354,7 +354,7 @@ void ReadFilter::filter_internal(istream* in) { #endif Counts read_counts = filter_alignment(read); counts_vec[omp_get_thread_num()] += read_counts; - if ((read_counts.keep() != complement_filter) && write_output) { + if ((read_counts.keep() != complement_filter) && (write_output || write_tsv)) { if (write_tsv) { emit_tsv(read); } else { @@ -375,7 +375,7 @@ void ReadFilter::filter_internal(istream* in) { read_counts.set_paired_any(); } counts_vec[omp_get_thread_num()] += read_counts; - if ((read_counts.keep() != complement_filter) && write_output) { + if ((read_counts.keep() != complement_filter) && (write_output || write_tsv)) { if (write_tsv) { emit_tsv(read1); emit_tsv(read2); From 951db07cfa62df9cdbb372af7721db0453697cdd Mon Sep 17 00:00:00 2001 From: Xian Date: Wed, 24 Jan 2024 22:33:50 +0100 Subject: [PATCH 10/14] Try getting annotations with different types --- src/readfilter.hpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index bc366387e3e..f4c9fd2fca7 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -1360,8 +1360,16 @@ bool ReadFilter::matches_annotation(const Read& read) const { if (!has_annotation(read, annotation_key)) { return false; } else { + google::protobuf::Value value = read.annotation().fields().at(annotation_key); string annotation_val = annotation_to_match.substr(colon_pos+1, annotation_to_match.size() - colon_pos - 1); - return get_annotation(read, annotation_key) == annotation_val; + if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue) { + double read_value = get_annotation(read, annotation_key); + return read_value == std::stod(annotation_val); + } else if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) { + return get_annotation(read, annotation_key) == annotation_val; + } else { + throw runtime_error("error: Cannot check equality of annotation " + annotation_to_match); + } } } @@ -1391,7 +1399,16 @@ void ReadFilter::emit_tsv(Read& read) { if (!has_annotation(read, field.substr(11, field.size()-11))) { throw runtime_error("error: Cannot find annotation "+ field); } else { - cout << get_annotation(read, field.substr(11, field.size()-11)); + string annotation_key = field.substr(11, field.size()-11); + google::protobuf::Value value = read.annotation().fields().at(annotation_key); + + if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue) { + cout << get_annotation(read, annotation_key); + } else if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) { + cout << get_annotation(read, annotation_key); + } else { + cout << "?"; + } } } else { cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl; From 9a0a835553f534379686c2e01b52c6669b837a58 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Mon, 26 Feb 2024 13:59:57 -0800 Subject: [PATCH 11/14] Add sequence to tsv output --- src/readfilter.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index f4c9fd2fca7..43aab1fed76 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -387,6 +387,7 @@ void ReadFilter::filter_internal(istream* in) { if (write_tsv) { //write a header for the tsv + cout << "#"; for (size_t i = 0 ; i < output_fields.size() ; i++) { const string& field = output_fields[i]; cout << field; @@ -1393,6 +1394,8 @@ void ReadFilter::emit_tsv(Read& read) { } } else if (field == "mapping_quality") { cout << get_mapq(read); + } else if (field == "sequence") { + cout << read.sequence(); } else if (field == "annotation") { throw runtime_error("error: Cannot write all annotations"); } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") { From 1e600baab0e16f71f3b4e76d88544eeab99f19d1 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 29 Feb 2024 21:06:44 +0100 Subject: [PATCH 12/14] Add time_used --- src/readfilter.cpp | 11 ----------- src/readfilter.hpp | 22 ++++++++++++++++++++-- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/readfilter.cpp b/src/readfilter.cpp index e59d35f94ab..f24e84ce127 100644 --- a/src/readfilter.cpp +++ b/src/readfilter.cpp @@ -47,16 +47,5 @@ bool ReadFilter::is_mapped(const MultipathAlignment& mp_alig return false; } -template<> -bool ReadFilter::is_correctly_mapped(const Alignment& alignment) const { - return alignment.correctly_mapped(); -} - -//Looks like multipath alignments don't have a field for this -template<> -bool ReadFilter::is_correctly_mapped(const MultipathAlignment& alignment) const { - throw(std::runtime_error("error: multipath alignments don't have a field correctly_mapped")); - return true; -} } diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 43aab1fed76..7876cfb5479 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -451,6 +451,18 @@ inline int ReadFilter::filter(istream* alignment_stream) { return 0; } +template<> +inline bool ReadFilter::is_correctly_mapped(const Alignment& alignment) const { + return alignment.correctly_mapped(); +} + +//Looks like multipath alignments don't have a field for this +template<> +inline bool ReadFilter::is_correctly_mapped(const MultipathAlignment& alignment) const { + throw(std::runtime_error("error: multipath alignments don't have a field correctly_mapped")); + return true; +} + template<> inline void ReadFilter::emit(Alignment& aln) { @@ -1376,8 +1388,12 @@ bool ReadFilter::matches_annotation(const Read& read) const { } -template -void ReadFilter::emit_tsv(Read& read) { +template<> +inline void ReadFilter::emit_tsv(MultipathAlignment& read) { + return; +} +template<> +inline void ReadFilter::emit_tsv(Alignment& read) { #pragma omp critical (cout) { @@ -1396,6 +1412,8 @@ void ReadFilter::emit_tsv(Read& read) { cout << get_mapq(read); } else if (field == "sequence") { cout << read.sequence(); + } else if (field == "time_used") { + cout << read.time_used(); } else if (field == "annotation") { throw runtime_error("error: Cannot write all annotations"); } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") { From 3de54147056064e30132a251067e975cb1fcc021 Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 1 Mar 2024 14:22:28 +0100 Subject: [PATCH 13/14] Add correctness tsv output that also checks off-reference --- src/readfilter.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 7876cfb5479..71e01590e52 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -1408,6 +1408,14 @@ inline void ReadFilter::emit_tsv(Alignment& read) { } else { cout << "False"; } + } else if (field == "correctness") { + if (is_correctly_mapped(read)) { + cout << "correct"; + } else if (has_annotation(read, "no_truth") && get_annotation(read, "no_truth")) { + cout << "off-reference"; + } else { + cout << "incorrect"; + } } else if (field == "mapping_quality") { cout << get_mapq(read); } else if (field == "sequence") { From 69c55b68d8b817f8eafdfa05ffd790d09f0a6c17 Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 1 Mar 2024 14:35:55 +0100 Subject: [PATCH 14/14] Add check for boolean annotations --- src/readfilter.hpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 71e01590e52..7c0fab58daa 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -1367,7 +1367,16 @@ bool ReadFilter::matches_annotation(const Read& read) const { size_t colon_pos = annotation_to_match.find(":"); if (colon_pos == string::npos) { //If there was no colon, then just check for the existence of the annotation - return has_annotation(read, annotation_to_match); + // or, if it is a boolean value, check that it's true + if (!has_annotation(read, annotation_to_match)) { + return false; + } + google::protobuf::Value value = read.annotation().fields().at(annotation_to_match); + if (value.kind_case() == google::protobuf::Value::KindCase::kBoolValue) { + return get_annotation(read, annotation_to_match); + } else { + return true; + } } else { string annotation_key = annotation_to_match.substr(0, colon_pos); if (!has_annotation(read, annotation_key)) {