vgteam · xchang1 · Mar 2, 2024 · Jan 11, 2024 · Jan 11, 2024 · Jan 12, 2024
diff --git a/src/readfilter.cpp b/src/readfilter.cpp
@@ -24,6 +24,8 @@ ostream& operator<<(ostream& os, const Counts& counts) {
        << "Min Quality Filter:            " << counts.counts[Counts::FilterName::min_mapq] << endl
        << "Min Base Quality Filter:       " << counts.counts[Counts::FilterName::min_base_qual] << endl
        << "Random Filter:                 " << counts.counts[Counts::FilterName::random] << endl
+       << "Annotation Filter:             " << counts.counts[Counts::FilterName::annotation] << endl
+       << "Incorrectly Mapped Filter:     " << counts.counts[Counts::FilterName::incorrectly_mapped] << endl
        << endl;
     return os;
 }
@@ -45,4 +47,5 @@ bool ReadFilter<MultipathAlignment>::is_mapped(const MultipathAlignment& mp_alig
     return false;
 }
 
+
 }
diff --git a/src/readfilter.hpp b/src/readfilter.hpp
@@ -90,6 +90,10 @@ class ReadFilter{
     /// Sometimes we only want a report, and not a filtered gam.  toggling off output
     /// speeds things up considerably.
     bool write_output = true;
+    /// Sometimes we want to pick out fields and write a tsv instead of a gam;
+    bool write_tsv = false;
+    /// What fields do we want to write to the tsv?
+    vector<string> output_fields;
     /// A HandleGraph is required for some filters (Note: ReadFilter doesn't own/free this)
     const HandleGraph* graph = nullptr;
     /// Interleaved input
@@ -102,6 +106,14 @@ class ReadFilter{
     int min_base_quality = numeric_limits<int>::min() / 2;
     // minimum fraction of bases in reads that must have quality at least <min_base_quality>
     double min_base_quality_fraction = numeric_limits<double>::lowest();
+
+    /// A string formatted "annotation[.subfield]*:value"
+    /// Value is optional if the key is a flag
+    /// Used like jq select
+    string annotation_to_match = "";
+
+    /// Filter to only correctly mapped reads
+    bool only_correctly_mapped = false;
 
     /**
      * Run all the filters on an alignment. The alignment may get modified in-place by the defray filter
@@ -243,7 +255,17 @@ class ReadFilter{
      * Does the read contain at least one of the indicated sequences
      */
     bool contains_subsequence(const Read& read) const;
+
+    /** 
+     * Does has the given annotation and does it match
+     */
+     bool matches_annotation(const Read& read) const;
 
+    /**
+     * Check if the alignment is marked as being correctly mapped
+     */
+    bool is_correctly_mapped(const Read& read) const;
+
     /**
      * Write the read to stdout
      */
@@ -253,6 +275,12 @@ class ReadFilter{
      * Write a read pair to stdout
      */
     void emit(Read& read1, Read& read2);
+
+    /**
+     * Write a tsv line for a read to stdout
+     */
+     void emit_tsv(Read& read);
+
 
 
     /// The twp specializations have different writing infrastructure
@@ -268,7 +296,7 @@ struct Counts {
     // note: "last" must be kept as the final value in this enum
     enum FilterName { read = 0, wrong_name, wrong_refpos, excluded_feature, min_score, min_sec_score, max_overhang,
         min_end_matches, min_mapq, split, repeat, defray, defray_all, random, min_base_qual, subsequence, filtered,
-        proper_pair, unmapped, last};
+        proper_pair, unmapped, annotation, incorrectly_mapped, last};
     vector<size_t> counts;
     Counts () : counts(FilterName::last, 0) {}
     Counts& operator+=(const Counts& other) {
@@ -326,8 +354,12 @@ void ReadFilter<Read>::filter_internal(istream* in) {
 #endif
         Counts read_counts = filter_alignment(read);
         counts_vec[omp_get_thread_num()] += read_counts;
-        if ((read_counts.keep() != complement_filter) && write_output) {
-            emit(read);
+        if ((read_counts.keep() != complement_filter) && (write_output || write_tsv)) {
+            if (write_tsv) {
+                emit_tsv(read);
+            } else {
+                emit(read);
+            }
         }
     };
 
@@ -343,10 +375,27 @@ void ReadFilter<Read>::filter_internal(istream* in) {
             read_counts.set_paired_any();
         }
         counts_vec[omp_get_thread_num()] += read_counts;
-        if ((read_counts.keep() != complement_filter) && write_output) {
-            emit(read1, read2);
+        if ((read_counts.keep() != complement_filter) && (write_output || write_tsv)) {
+            if (write_tsv) {
+                emit_tsv(read1);
+                emit_tsv(read2);
+            } else {
+                emit(read1, read2);
+            }
         }
     };
+
+    if (write_tsv) {
+        //write a header for the tsv
+        cout << "#";
+        for (size_t i = 0 ; i < output_fields.size() ; i++) {
+            const string& field = output_fields[i];
+            cout << field;
+            if (i != output_fields.size()-1) {
+                cout << "\t";
+            }
+        }
+    }
 
     if (interleaved) {
         vg::io::for_each_interleaved_pair_parallel(*in, pair_lambda);
@@ -402,6 +451,18 @@ inline int ReadFilter<MultipathAlignment>::filter(istream* alignment_stream) {
 
     return 0;
 }
+template<>
+inline bool ReadFilter<Alignment>::is_correctly_mapped(const Alignment& alignment) const {
+    return alignment.correctly_mapped();
+}
+
+//Looks like multipath alignments don't have a field for this
+template<>
+inline bool ReadFilter<MultipathAlignment>::is_correctly_mapped(const MultipathAlignment& alignment) const {
+    throw(std::runtime_error("error: multipath alignments don't have a field correctly_mapped"));
+    return true;
+}
+
 
 template<>
 inline void ReadFilter<Alignment>::emit(Alignment& aln) {
@@ -536,6 +597,19 @@ Counts ReadFilter<Read>::filter_alignment(Read& read) {
             keep = false;
         }
     }
+    if ((keep || verbose) && !annotation_to_match.empty()) {
+        if (!matches_annotation(read)) {
+            ++counts.counts[Counts::FilterName::annotation];
+            keep = false;
+        }
+    }
+
+    if ((keep || verbose) && only_correctly_mapped) {
+        if (!is_correctly_mapped(read)) {
+            ++counts.counts[Counts::FilterName::incorrectly_mapped];
+            keep = false;
+        }
+    }
 
     if (!keep) {
         ++counts.counts[Counts::FilterName::filtered];
@@ -1286,6 +1360,107 @@ bool ReadFilter<Read>::contains_subsequence(const Read& read) const {
     return found;
 }
 
+template<typename Read>
+bool ReadFilter<Read>::matches_annotation(const Read& read) const {
+
+    //Assume that there's only one level of annotations
+    size_t colon_pos = annotation_to_match.find(":");
+    if (colon_pos == string::npos) {
+        //If there was no colon, then just check for the existence of the annotation
+        // or, if it is a boolean value, check that it's true
+        if (!has_annotation(read, annotation_to_match)) {
+            return false;
+        }
+        google::protobuf::Value value = read.annotation().fields().at(annotation_to_match);
+        if (value.kind_case() == google::protobuf::Value::KindCase::kBoolValue) {
+            return get_annotation<bool>(read, annotation_to_match);
+        } else {
+            return true;
+        }
+    } else {
+        string annotation_key = annotation_to_match.substr(0, colon_pos);
+        if (!has_annotation(read, annotation_key)) {
+            return false;
+        } else {
+            google::protobuf::Value value = read.annotation().fields().at(annotation_key);
+            string annotation_val = annotation_to_match.substr(colon_pos+1, annotation_to_match.size() - colon_pos - 1);
+            if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue) {
+                double read_value = get_annotation<double>(read, annotation_key);
+                return read_value == std::stod(annotation_val);
+            } else if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) {
+                return get_annotation<string>(read, annotation_key) == annotation_val; 
+            } else {
+                throw runtime_error("error: Cannot check equality of annotation " + annotation_to_match);
+            }
+        }
+    }
+
+}
+
+template<>
+inline void ReadFilter<MultipathAlignment>::emit_tsv(MultipathAlignment& read) {
+    return;
+}
+template<>
+inline void ReadFilter<Alignment>::emit_tsv(Alignment& read) {
+#pragma omp critical (cout)
+    {
+
+        cout << endl;
+        for (size_t i = 0 ; i < output_fields.size() ; i++) {
+            const string& field = output_fields[i];
+            if (field == "name") {
+                cout << read.name();
+            } else if (field == "correctly_mapped") {
+                if (is_correctly_mapped(read)) {
+                    cout << "True";
+                } else {
+                    cout << "False";
+                }
+            } else if (field == "correctness") {
+                if (is_correctly_mapped(read)) {
+                    cout << "correct";
+                } else if (has_annotation(read, "no_truth") && get_annotation<bool>(read, "no_truth")) {
+                    cout << "off-reference";
+                } else {
+                    cout << "incorrect";
+                }
+            } else if (field == "mapping_quality") {
+                cout << get_mapq(read); 
+            } else if (field == "sequence") {
+                cout << read.sequence(); 
+            } else if (field == "time_used") {
+                cout << read.time_used();
+            } else if (field == "annotation") {
+                throw runtime_error("error: Cannot write all annotations");
+            } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") {
+                if (!has_annotation(read, field.substr(11, field.size()-11))) {
+                    throw runtime_error("error: Cannot find annotation "+ field);
+                } else {
+                    string annotation_key = field.substr(11, field.size()-11);
+                    google::protobuf::Value value = read.annotation().fields().at(annotation_key);
+
+                    if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue) {
+                        cout << get_annotation<double>(read, annotation_key);
+                    } else if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) {
+                        cout << get_annotation<string>(read, annotation_key);
+                    } else {
+                        cout << "?";
+                    }
+                }
+            } else {
+                cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl;
+                throw runtime_error("error: Writing non-existent field to tsv: " + field);
+            }
+            if (i != output_fields.size()-1) {
+                cout << "\t";
+            }
+        }
+
+    }
+}
+
+
 template<typename Read>
 bool ReadFilter<Read>::sample_read(const Read& read) const {
     // Decide if the alignment is paired.

diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp
@@ -48,6 +48,7 @@ void help_filter(char** argv) {
          << "    -x, --xg-name FILE         use this xg index or graph (required for -S and -D)" << endl
          << "    -v, --verbose              print out statistics on numbers of reads filtered by what." << endl
          << "    -V, --no-output            print out statistics (as above) but do not write out filtered GAM." << endl
+         << "    -T, --tsv-out FIELD[;FIELD] do not write filtered gam but a tsv of the given fields" << endl
          << "    -q, --min-mapq N           filter alignments with mapping quality < N" << endl
          << "    -E, --repeat-ends N        filter reads with tandem repeat (motif size <= 2N, spanning >= N bases) at either end" << endl
          << "    -D, --defray-ends N        clip back the ends of reads that are ambiguously aligned, up to N bases" << endl
@@ -56,6 +57,9 @@ void help_filter(char** argv) {
          << "    -i, --interleaved          assume interleaved input. both ends will be filtered out if either fails filter" << endl
          << "    -I, --interleaved-all      assume interleaved input. both ends will be filtered out if *both* fail filters" << endl
          << "    -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q." << endl
+         << "    -B, --annotation K[:V]     keep reads if the annotation is present. If a value is given, keep reads if the values are equal" << endl
+         << "                               similar to running jq 'select(.annotation.K==V)' on the json" << endl 
+         << "    -c, --correctly-mapped     keep only reads that are marked as correctly-mapped" << endl
          << "    -U, --complement           apply the complement of the filter implied by the other arguments." << endl
          << "    -t, --threads N            number of threads [1]" << endl;
 }
@@ -105,6 +109,9 @@ int main_filter(int argc, char** argv) {
     bool complement_filter = false;
     bool only_proper_pairs = false;
     bool only_mapped = false;
+    string annotation = "";
+    string output_fields = "";
+    bool correctly_mapped = false;
 
     // What XG index, if any, should we load to support the other options?
     string xg_name;
@@ -133,6 +140,7 @@ int main_filter(int argc, char** argv) {
                 {"drop-split",  no_argument, 0, 'S'},
                 {"xg-name", required_argument, 0, 'x'},
                 {"verbose",  no_argument, 0, 'v'},
+                {"tsv-out",  no_argument, 0, 'T'},
                 {"min-mapq", required_argument, 0, 'q'},
                 {"repeat-ends", required_argument, 0, 'E'},
                 {"defray-ends", required_argument, 0, 'D'},
@@ -141,13 +149,15 @@ int main_filter(int argc, char** argv) {
                 {"interleaved", no_argument, 0, 'i'},
                 {"interleaved-all", no_argument, 0, 'I'},
                 {"min-base-quality", required_argument, 0, 'b'},
+                {"annotation", required_argument, 0, 'B'},
+                {"correctly-mapped", no_argument, 0, 'c'},
                 {"complement", no_argument, 0, 'U'},
                 {"threads", required_argument, 0, 't'},
                 {0, 0, 0, 0}
             };
 
         int option_index = 0;
-        c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:Ut:",
+        c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:cUt:",
                          long_options, &option_index);
 
         /* Detect the end of the options. */
@@ -244,6 +254,9 @@ int main_filter(int argc, char** argv) {
             verbose = true;
             write_output = false;
             break;
+        case 'T':
+            output_fields=optarg;
+            break;
         case 'E':
             set_repeat_size = true;
             repeat_size = parse<int>(optarg);
@@ -308,6 +321,12 @@ int main_filter(int argc, char** argv) {
                 }
             }
             break;
+        case 'B':
+            annotation = optarg;
+            break;
+        case 'c':
+            correctly_mapped = true;
+            break;
         case 'U':
             complement_filter = true;
             break;
@@ -375,6 +394,20 @@ int main_filter(int argc, char** argv) {
         }
         filter.verbose = verbose;
         filter.write_output = write_output;
+
+
+        if (!output_fields.empty()){
+            //Get the fields for tsv output
+            filter.write_tsv = true;
+            filter.write_output = false;
+            size_t start_i = 0;
+            for (size_t end_i = 0 ; end_i <= output_fields.size() ; end_i++) {
+                if (end_i == output_fields.size() || output_fields[end_i] == ';') {
+                    filter.output_fields.emplace_back(output_fields.substr(start_i, end_i-start_i));
+                    start_i = end_i + 1;
+                }
+            }
+        }
         if (set_repeat_size) {
             filter.repeat_size = repeat_size;
         }
@@ -403,6 +436,8 @@ int main_filter(int argc, char** argv) {
             filter.min_base_quality = min_base_quality;
             filter.min_base_quality_fraction = min_base_quality_fraction;
         }
+        filter.annotation_to_match = annotation;
+        filter.only_correctly_mapped = correctly_mapped;
         filter.complement_filter = complement_filter;
         filter.threads = get_thread_count();
         filter.graph = xindex;