From cc2be0a9af300bbd10c5a30968b0dd6d80c08b02 Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Thu, 11 Jan 2024 19:27:09 +0100
Subject: [PATCH 01/14] Add a filter for annotation matching

---
 src/readfilter.hpp             | 70 +++++++++++++++++++++++++++++++++-
 src/subcommand/filter_main.cpp |  9 ++++-
 2 files changed, 77 insertions(+), 2 deletions(-)
diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index 454983a4644..2f2d2510498 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -102,6 +102,11 @@ class ReadFilter{
     int min_base_quality = numeric_limits<int>::min() / 2;
     // minimum fraction of bases in reads that must have quality at least <min_base_quality>
     double min_base_quality_fraction = numeric_limits<double>::lowest();
+
+    /// A string formatted "annotation[.subfield]*:value"
+    /// Value is optional if the key is a flag
+    /// Used like jq select
+    string annotation = "";
       
     /**
      * Run all the filters on an alignment. The alignment may get modified in-place by the defray filter
@@ -243,6 +248,11 @@ class ReadFilter{
      * Does the read contain at least one of the indicated sequences
      */
     bool contains_subsequence(const Read& read) const;
+
+    /** 
+     * Does has the given annotation and does it match
+     */
+     bool matches_annotation(const Read& read) const;
     
     /**
      * Write the read to stdout
@@ -253,6 +263,7 @@ class ReadFilter{
      * Write a read pair to stdout
      */
     void emit(Read& read1, Read& read2);
+
     
     
     /// The twp specializations have different writing infrastructure
@@ -268,7 +279,7 @@ struct Counts {
     // note: "last" must be kept as the final value in this enum
     enum FilterName { read = 0, wrong_name, wrong_refpos, excluded_feature, min_score, min_sec_score, max_overhang,
         min_end_matches, min_mapq, split, repeat, defray, defray_all, random, min_base_qual, subsequence, filtered,
-        proper_pair, unmapped, last};
+        proper_pair, unmapped, annotation, last};
     vector<size_t> counts;
     Counts () : counts(FilterName::last, 0) {}
     Counts& operator+=(const Counts& other) {
@@ -536,6 +547,12 @@ Counts ReadFilter<Read>::filter_alignment(Read& read) {
             keep = false;
         }
     }
+    if ((keep || verbose) && !annotation.empty()) {
+        if (!matches_annotation(read)) {
+            ++counts.counts[Counts::FilterName::annotation];
+            keep = false;
+        }
+    }
     
     if (!keep) {
         ++counts.counts[Counts::FilterName::filtered];
@@ -1286,6 +1303,57 @@ bool ReadFilter<Read>::contains_subsequence(const Read& read) const {
     return found;
 }
 
+template<typename Read>
+bool ReadFilter<Read>::matches_annotation(const Read& read) const {
+
+    //Walk through the annotation until we find a . or :
+    size_t start_i = 0;
+
+    //This gets updated as needed
+    google::protobuf::Struct read_annotation = read.annotation();
+    for (size_t end_i = 0 ; end_i < annotation.size() ; end_i++) {
+        if (annotation[end_i] == '.' || annotation[end_i] == ':') {
+            if (!read_annotation.fields().count(annotation.substr(start_i, end_i-start_i))) {
+                //If the annotation doesn't exist 
+                return false;
+            } else if ( end_i == annotation.size()-1) {
+                //If the annotation does exist and we only want to check if the read has it
+                return true;
+            } else if (annotation[end_i] == ':') {
+                //If this is the last annotation key and now we want to check the value
+                google::protobuf::Value value= read_annotation.fields().at(annotation.substr(start_i, end_i-start_i));
+
+                if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue &&
+                    regex_match(annotation.substr(end_i+1, annotation.size() - end_i - 1),
+                                regex("([0-9]*\\.)?[0-9]*"))) {
+                    //If the value is supposed to be a number
+                } else if (value_cast<string>(value) == annotation.substr(end_i+1, annotation.size() - end_i - 1)) {
+                    //Otherwise assume that the value is a string
+                    return true;
+                } else {
+                    return false;
+                }
+            } else {
+                //Otherwise, this is the end of one annotation but it has a subfield next
+                google::protobuf::Value value= read_annotation.fields().at(annotation.substr(start_i, end_i-start_i));
+                if (value.kind_case() != google::protobuf::Value::KindCase::kStructValue) {
+                    //If this isn't a google::protobuf::Struct, then the input was bad
+                    cerr << "warning: expected another annotation field after " 
+                         << annotation.substr(start_i, end_i-start_i) 
+                         << " but annotation ends with value " 
+                         << value_cast<string>(read_annotation.fields().at(annotation.substr(start_i, end_i-start_i))) << endl;
+                    return false;
+                } else {
+                    read_annotation = read_annotation.fields().at(annotation.substr(start_i, end_i-start_i)).struct_value();
+                }
+            }
+        }
+    }
+    //If we got through then we must have found the annotations
+    return true;
+    
+}
+
 template<typename Read>
 bool ReadFilter<Read>::sample_read(const Read& read) const {
     // Decide if the alignment is paired.
diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp
index 7a233e6986e..0b625ca4940 100644
--- a/src/subcommand/filter_main.cpp
+++ b/src/subcommand/filter_main.cpp
@@ -56,6 +56,7 @@ void help_filter(char** argv) {
          << "    -i, --interleaved          assume interleaved input. both ends will be filtered out if either fails filter" << endl
          << "    -I, --interleaved-all      assume interleaved input. both ends will be filtered out if *both* fail filters" << endl
          << "    -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q." << endl
+         << "    -B, --annotation K[:V]     keep reads if the annotation is present. If a value is given, keep reads if the value equal" << endl
          << "    -U, --complement           apply the complement of the filter implied by the other arguments." << endl
          << "    -t, --threads N            number of threads [1]" << endl;
 }
@@ -105,6 +106,7 @@ int main_filter(int argc, char** argv) {
     bool complement_filter = false;
     bool only_proper_pairs = false;
     bool only_mapped = false;
+    string annotation = "";
 
     // What XG index, if any, should we load to support the other options?
     string xg_name;
@@ -141,13 +143,14 @@ int main_filter(int argc, char** argv) {
                 {"interleaved", no_argument, 0, 'i'},
                 {"interleaved-all", no_argument, 0, 'I'},
                 {"min-base-quality", required_argument, 0, 'b'},
+                {"annotation", required_argument, 0, 'B'},
                 {"complement", no_argument, 0, 'U'},
                 {"threads", required_argument, 0, 't'},
                 {0, 0, 0, 0}
             };
 
         int option_index = 0;
-        c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:Ut:",
+        c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:B:Ut:",
                          long_options, &option_index);
 
         /* Detect the end of the options. */
@@ -308,6 +311,9 @@ int main_filter(int argc, char** argv) {
                 }
             }
             break;
+        case 'B':
+            annotation = optarg;
+            break;
         case 'U':
             complement_filter = true;
             break;
@@ -403,6 +409,7 @@ int main_filter(int argc, char** argv) {
             filter.min_base_quality = min_base_quality;
             filter.min_base_quality_fraction = min_base_quality_fraction;
         }
+        filter.annotation = annotation;
         filter.complement_filter = complement_filter;
         filter.threads = get_thread_count();
         filter.graph = xindex;

From 82260ce866856e4b9872201349638230ebcb0388 Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Thu, 11 Jan 2024 19:48:02 +0100
Subject: [PATCH 02/14] Add a filter for correctly mapped reads

---
 src/readfilter.cpp             | 14 +++++++++++
 src/readfilter.hpp             | 46 +++++++++++++++++++++++-----------
 src/subcommand/filter_main.cpp | 11 ++++++--
 3 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/src/readfilter.cpp b/src/readfilter.cpp
index 34d6dffc842..e59d35f94ab 100644
--- a/src/readfilter.cpp
+++ b/src/readfilter.cpp
@@ -24,6 +24,8 @@ ostream& operator<<(ostream& os, const Counts& counts) {
        << "Min Quality Filter:            " << counts.counts[Counts::FilterName::min_mapq] << endl
        << "Min Base Quality Filter:       " << counts.counts[Counts::FilterName::min_base_qual] << endl
        << "Random Filter:                 " << counts.counts[Counts::FilterName::random] << endl
+       << "Annotation Filter:             " << counts.counts[Counts::FilterName::annotation] << endl
+       << "Incorrectly Mapped Filter:     " << counts.counts[Counts::FilterName::incorrectly_mapped] << endl
        << endl;
     return os;
 }
@@ -45,4 +47,16 @@ bool ReadFilter<MultipathAlignment>::is_mapped(const MultipathAlignment& mp_alig
     return false;
 }
 
+template<>
+bool ReadFilter<Alignment>::is_correctly_mapped(const Alignment& alignment) const {
+    return alignment.correctly_mapped();
+}
+
+//Looks like multipath alignments don't have a field for this
+template<>
+bool ReadFilter<MultipathAlignment>::is_correctly_mapped(const MultipathAlignment& alignment) const {
+    throw(std::runtime_error("error: multipath alignments don't have a field correctly_mapped"));
+    return true;
+}
+
 }
diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index 2f2d2510498..882d27c222b 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -106,7 +106,10 @@ class ReadFilter{
     /// A string formatted "annotation[.subfield]*:value"
     /// Value is optional if the key is a flag
     /// Used like jq select
-    string annotation = "";
+    string annotation_to_match = "";
+
+    /// Filter to only correctly mapped reads
+    bool only_correctly_mapped = false;
       
     /**
      * Run all the filters on an alignment. The alignment may get modified in-place by the defray filter
@@ -254,6 +257,11 @@ class ReadFilter{
      */
      bool matches_annotation(const Read& read) const;
     
+    /**
+     * Check if the alignment is marked as being correctly mapped
+     */
+    bool is_correctly_mapped(const Read& read) const;
+
     /**
      * Write the read to stdout
      */
@@ -279,7 +287,7 @@ struct Counts {
     // note: "last" must be kept as the final value in this enum
     enum FilterName { read = 0, wrong_name, wrong_refpos, excluded_feature, min_score, min_sec_score, max_overhang,
         min_end_matches, min_mapq, split, repeat, defray, defray_all, random, min_base_qual, subsequence, filtered,
-        proper_pair, unmapped, annotation, last};
+        proper_pair, unmapped, annotation, incorrectly_mapped, last};
     vector<size_t> counts;
     Counts () : counts(FilterName::last, 0) {}
     Counts& operator+=(const Counts& other) {
@@ -547,12 +555,19 @@ Counts ReadFilter<Read>::filter_alignment(Read& read) {
             keep = false;
         }
     }
-    if ((keep || verbose) && !annotation.empty()) {
+    if ((keep || verbose) && !annotation_to_match.empty()) {
         if (!matches_annotation(read)) {
             ++counts.counts[Counts::FilterName::annotation];
             keep = false;
         }
     }
+
+    if ((keep || verbose) && only_correctly_mapped) {
+        if (!is_correctly_mapped(read)) {
+            ++counts.counts[Counts::FilterName::incorrectly_mapped];
+            keep = false;
+        }
+    }
     
     if (!keep) {
         ++counts.counts[Counts::FilterName::filtered];
@@ -1311,23 +1326,23 @@ bool ReadFilter<Read>::matches_annotation(const Read& read) const {
 
     //This gets updated as needed
     google::protobuf::Struct read_annotation = read.annotation();
-    for (size_t end_i = 0 ; end_i < annotation.size() ; end_i++) {
-        if (annotation[end_i] == '.' || annotation[end_i] == ':') {
-            if (!read_annotation.fields().count(annotation.substr(start_i, end_i-start_i))) {
+    for (size_t end_i = 0 ; end_i < annotation_to_match.size() ; end_i++) {
+        if (annotation_to_match[end_i] == '.' || annotation_to_match[end_i] == ':') {
+            if (!read_annotation.fields().count(annotation_to_match.substr(start_i, end_i-start_i))) {
                 //If the annotation doesn't exist 
                 return false;
-            } else if ( end_i == annotation.size()-1) {
+            } else if ( end_i == annotation_to_match.size()-1) {
                 //If the annotation does exist and we only want to check if the read has it
                 return true;
-            } else if (annotation[end_i] == ':') {
+            } else if (annotation_to_match[end_i] == ':') {
                 //If this is the last annotation key and now we want to check the value
-                google::protobuf::Value value= read_annotation.fields().at(annotation.substr(start_i, end_i-start_i));
+                google::protobuf::Value value= read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i));
 
                 if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue &&
-                    regex_match(annotation.substr(end_i+1, annotation.size() - end_i - 1),
+                    regex_match(annotation_to_match.substr(end_i+1, annotation_to_match.size() - end_i - 1),
                                 regex("([0-9]*\\.)?[0-9]*"))) {
                     //If the value is supposed to be a number
-                } else if (value_cast<string>(value) == annotation.substr(end_i+1, annotation.size() - end_i - 1)) {
+                } else if (value_cast<string>(value) == annotation_to_match.substr(end_i+1, annotation_to_match.size() - end_i - 1)) {
                     //Otherwise assume that the value is a string
                     return true;
                 } else {
@@ -1335,16 +1350,16 @@ bool ReadFilter<Read>::matches_annotation(const Read& read) const {
                 }
             } else {
                 //Otherwise, this is the end of one annotation but it has a subfield next
-                google::protobuf::Value value= read_annotation.fields().at(annotation.substr(start_i, end_i-start_i));
+                google::protobuf::Value value= read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i));
                 if (value.kind_case() != google::protobuf::Value::KindCase::kStructValue) {
                     //If this isn't a google::protobuf::Struct, then the input was bad
                     cerr << "warning: expected another annotation field after " 
-                         << annotation.substr(start_i, end_i-start_i) 
+                         << annotation_to_match.substr(start_i, end_i-start_i) 
                          << " but annotation ends with value " 
-                         << value_cast<string>(read_annotation.fields().at(annotation.substr(start_i, end_i-start_i))) << endl;
+                         << value_cast<string>(read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i))) << endl;
                     return false;
                 } else {
-                    read_annotation = read_annotation.fields().at(annotation.substr(start_i, end_i-start_i)).struct_value();
+                    read_annotation = read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i)).struct_value();
                 }
             }
         }
@@ -1354,6 +1369,7 @@ bool ReadFilter<Read>::matches_annotation(const Read& read) const {
     
 }
 
+
 template<typename Read>
 bool ReadFilter<Read>::sample_read(const Read& read) const {
     // Decide if the alignment is paired.
diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp
index 0b625ca4940..b22c647c940 100644
--- a/src/subcommand/filter_main.cpp
+++ b/src/subcommand/filter_main.cpp
@@ -57,6 +57,7 @@ void help_filter(char** argv) {
          << "    -I, --interleaved-all      assume interleaved input. both ends will be filtered out if *both* fail filters" << endl
          << "    -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q." << endl
          << "    -B, --annotation K[:V]     keep reads if the annotation is present. If a value is given, keep reads if the value equal" << endl
+         << "    -c, --correctly-mapped     keep only reads that are marked as correctly-mapped" << endl
          << "    -U, --complement           apply the complement of the filter implied by the other arguments." << endl
          << "    -t, --threads N            number of threads [1]" << endl;
 }
@@ -107,6 +108,7 @@ int main_filter(int argc, char** argv) {
     bool only_proper_pairs = false;
     bool only_mapped = false;
     string annotation = "";
+    bool correctly_mapped = false;
 
     // What XG index, if any, should we load to support the other options?
     string xg_name;
@@ -144,13 +146,14 @@ int main_filter(int argc, char** argv) {
                 {"interleaved-all", no_argument, 0, 'I'},
                 {"min-base-quality", required_argument, 0, 'b'},
                 {"annotation", required_argument, 0, 'B'},
+                {"correctly-mapped", no_argument, 0, 'c'},
                 {"complement", no_argument, 0, 'U'},
                 {"threads", required_argument, 0, 't'},
                 {0, 0, 0, 0}
             };
 
         int option_index = 0;
-        c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:B:Ut:",
+        c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:B:cUt:",
                          long_options, &option_index);
 
         /* Detect the end of the options. */
@@ -314,6 +317,9 @@ int main_filter(int argc, char** argv) {
         case 'B':
             annotation = optarg;
             break;
+        case 'c':
+            correctly_mapped = true;
+            break;
         case 'U':
             complement_filter = true;
             break;
@@ -409,7 +415,8 @@ int main_filter(int argc, char** argv) {
             filter.min_base_quality = min_base_quality;
             filter.min_base_quality_fraction = min_base_quality_fraction;
         }
-        filter.annotation = annotation;
+        filter.annotation_to_match = annotation;
+        filter.only_correctly_mapped = true;
         filter.complement_filter = complement_filter;
         filter.threads = get_thread_count();
         filter.graph = xindex;

From 146f1895d023e42a8114e04b1ae34c1c10b07ce0 Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Fri, 12 Jan 2024 14:36:02 +0100
Subject: [PATCH 03/14] Simplify annotation matching and add tsv output to vg
 filter

---
 src/readfilter.hpp             | 117 +++++++++++++++++++++------------
 src/subcommand/filter_main.cpp |  24 ++++++-
 2 files changed, 96 insertions(+), 45 deletions(-)

diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index 882d27c222b..d567a3698e2 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -90,6 +90,10 @@ class ReadFilter{
     /// Sometimes we only want a report, and not a filtered gam.  toggling off output
     /// speeds things up considerably.
     bool write_output = true;
+    /// Sometimes we want to pick out fields and write a tsv instead of a gam;
+    bool write_tsv = false;
+    /// What fields do we want to write to the tsv?
+    vector<string> output_fields;
     /// A HandleGraph is required for some filters (Note: ReadFilter doesn't own/free this)
     const HandleGraph* graph = nullptr;
     /// Interleaved input
@@ -272,6 +276,11 @@ class ReadFilter{
      */
     void emit(Read& read1, Read& read2);
 
+    /**
+     * Write a tsv line for a read to stdout
+     */
+     void emit_tsv(Read& read);
+
     
     
     /// The twp specializations have different writing infrastructure
@@ -346,7 +355,11 @@ void ReadFilter<Read>::filter_internal(istream* in) {
         Counts read_counts = filter_alignment(read);
         counts_vec[omp_get_thread_num()] += read_counts;
         if ((read_counts.keep() != complement_filter) && write_output) {
-            emit(read);
+            if (write_tsv) {
+                emit_tsv(read);
+            } else {
+                emit(read);
+            }
         }
     };
     
@@ -363,9 +376,25 @@ void ReadFilter<Read>::filter_internal(istream* in) {
         }
         counts_vec[omp_get_thread_num()] += read_counts;
         if ((read_counts.keep() != complement_filter) && write_output) {
-            emit(read1, read2);
+            if (write_tsv) {
+                emit_tsv(read1);
+                emit_tsv(read2);
+            } else {
+                emit(read1, read2);
+            }
         }
     };
+
+    if (write_tsv) {
+        //write a header for the tsv
+        for (size_t i = 0 ; i < output_fields.size() i++) {
+            const string& field = output_fields[i];
+            cout << field;
+            if (i == output_fields.size()-1) {
+                cout << "\t";
+            }
+        }
+    }
     
     if (interleaved) {
         vg::io::for_each_interleaved_pair_parallel(*in, pair_lambda);
@@ -1321,52 +1350,54 @@ bool ReadFilter<Read>::contains_subsequence(const Read& read) const {
 template<typename Read>
 bool ReadFilter<Read>::matches_annotation(const Read& read) const {
 
-    //Walk through the annotation until we find a . or :
-    size_t start_i = 0;
-
-    //This gets updated as needed
-    google::protobuf::Struct read_annotation = read.annotation();
-    for (size_t end_i = 0 ; end_i < annotation_to_match.size() ; end_i++) {
-        if (annotation_to_match[end_i] == '.' || annotation_to_match[end_i] == ':') {
-            if (!read_annotation.fields().count(annotation_to_match.substr(start_i, end_i-start_i))) {
-                //If the annotation doesn't exist 
-                return false;
-            } else if ( end_i == annotation_to_match.size()-1) {
-                //If the annotation does exist and we only want to check if the read has it
-                return true;
-            } else if (annotation_to_match[end_i] == ':') {
-                //If this is the last annotation key and now we want to check the value
-                google::protobuf::Value value= read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i));
+    //Assume that there's only one level of annotations
+    size_t colon_pos = annotation_to_match.find(":");
+    if (colon_pos == string::npos) {
+        //If there was no colon, then just check for the existence of the annotation
+        return has_annotation(read, annotation_to_match);
+    } else {
+        string annotation_key = annotation_to_match.substr(0, colon_pos);
+        if (!has_annotation(read, annotation_key)) {
+            return false;
+        } else {
+            string annotation_val = annotation_to_match.substr(colon_pos+1, annotation_to_match.size() - colon_pos - 1);
+            return get_annotation<string>(read, annotation_key) == annotation_val; 
+        }
+    }
+    
+}
 
-                if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue &&
-                    regex_match(annotation_to_match.substr(end_i+1, annotation_to_match.size() - end_i - 1),
-                                regex("([0-9]*\\.)?[0-9]*"))) {
-                    //If the value is supposed to be a number
-                } else if (value_cast<string>(value) == annotation_to_match.substr(end_i+1, annotation_to_match.size() - end_i - 1)) {
-                    //Otherwise assume that the value is a string
-                    return true;
-                } else {
-                    return false;
-                }
+template<typename Read>
+void ReadFilter<Read>::emit_tsv(Read& read) {
+    cout << endl;
+    for (size_t i = 0 ; i < output_fields.size() ; i++) {
+        const string& field = output_fields[i];
+        if (field == "name") {
+            cout << read.name();
+        } else if (field == "correctly_mapped") {
+            if (is_correctly_mapped(read)) {
+                cout << "True";
             } else {
-                //Otherwise, this is the end of one annotation but it has a subfield next
-                google::protobuf::Value value= read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i));
-                if (value.kind_case() != google::protobuf::Value::KindCase::kStructValue) {
-                    //If this isn't a google::protobuf::Struct, then the input was bad
-                    cerr << "warning: expected another annotation field after " 
-                         << annotation_to_match.substr(start_i, end_i-start_i) 
-                         << " but annotation ends with value " 
-                         << value_cast<string>(read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i))) << endl;
-                    return false;
-                } else {
-                    read_annotation = read_annotation.fields().at(annotation_to_match.substr(start_i, end_i-start_i)).struct_value();
-                }
+                cout << "False";
+            }
+        } else if (field == "mapping_quality") {
+            cout << get_mapq(read); 
+        } else if (field == "annotation") {
+            throw runtime_error("error: Cannot write all annotations");
+        } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") {
+            if (!has_annotation(read, field.substr(11, field.size()-11))) {
+                throw runtime_error("error: Cannot find annotation "+ field);
+            } else {
+                cout << get_annotation<string>(read, field.substr(11, field.size()-11));
             }
+        } else {
+            cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl;
+            throw runtime_error("error: Writing non-existent field to tsv: " + field);
+        }
+        if (i != output_fields.size()-1) {
+            cout << "\t";
         }
     }
-    //If we got through then we must have found the annotations
-    return true;
-    
 }
 
 
diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp
index b22c647c940..dae80d2c8e1 100644
--- a/src/subcommand/filter_main.cpp
+++ b/src/subcommand/filter_main.cpp
@@ -48,6 +48,7 @@ void help_filter(char** argv) {
          << "    -x, --xg-name FILE         use this xg index or graph (required for -S and -D)" << endl
          << "    -v, --verbose              print out statistics on numbers of reads filtered by what." << endl
          << "    -V, --no-output            print out statistics (as above) but do not write out filtered GAM." << endl
+         << "    -T, --tsv-out FIELD[;FIELD] do not write filtered gam but a tsv of the given fields" << endl
          << "    -q, --min-mapq N           filter alignments with mapping quality < N" << endl
          << "    -E, --repeat-ends N        filter reads with tandem repeat (motif size <= 2N, spanning >= N bases) at either end" << endl
          << "    -D, --defray-ends N        clip back the ends of reads that are ambiguously aligned, up to N bases" << endl
@@ -56,7 +57,8 @@ void help_filter(char** argv) {
          << "    -i, --interleaved          assume interleaved input. both ends will be filtered out if either fails filter" << endl
          << "    -I, --interleaved-all      assume interleaved input. both ends will be filtered out if *both* fail filters" << endl
          << "    -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q." << endl
-         << "    -B, --annotation K[:V]     keep reads if the annotation is present. If a value is given, keep reads if the value equal" << endl
+         << "    -B, --annotation K[:V]     keep reads if the annotation is present. If a value is given, keep reads if the values are equal" << endl
+         << "                               similar to running jq 'select(.annotation.K==V)' on the json" << endl 
          << "    -c, --correctly-mapped     keep only reads that are marked as correctly-mapped" << endl
          << "    -U, --complement           apply the complement of the filter implied by the other arguments." << endl
          << "    -t, --threads N            number of threads [1]" << endl;
@@ -108,6 +110,7 @@ int main_filter(int argc, char** argv) {
     bool only_proper_pairs = false;
     bool only_mapped = false;
     string annotation = "";
+    string output_fields = "";
     bool correctly_mapped = false;
 
     // What XG index, if any, should we load to support the other options?
@@ -137,6 +140,7 @@ int main_filter(int argc, char** argv) {
                 {"drop-split",  no_argument, 0, 'S'},
                 {"xg-name", required_argument, 0, 'x'},
                 {"verbose",  no_argument, 0, 'v'},
+                {"tsv-out",  no_argument, 0, 'T'},
                 {"min-mapq", required_argument, 0, 'q'},
                 {"repeat-ends", required_argument, 0, 'E'},
                 {"defray-ends", required_argument, 0, 'D'},
@@ -153,7 +157,7 @@ int main_filter(int argc, char** argv) {
             };
 
         int option_index = 0;
-        c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:B:cUt:",
+        c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:cUt:",
                          long_options, &option_index);
 
         /* Detect the end of the options. */
@@ -250,6 +254,9 @@ int main_filter(int argc, char** argv) {
             verbose = true;
             write_output = false;
             break;
+        case 'T':
+            output_fields=optarg;
+            break;
         case 'E':
             set_repeat_size = true;
             repeat_size = parse<int>(optarg);
@@ -387,6 +394,19 @@ int main_filter(int argc, char** argv) {
         }
         filter.verbose = verbose;
         filter.write_output = write_output;
+
+
+        if (!output_fields.empty()){
+            //Get the fields for tsv output
+            filter.write_tsv = true;
+            size_t start_i = 0;
+            for (size_t end_i = 0 ; end_i <= output_fields.size() ; end_i++) {
+                if (end_i == output_fields.size() || output_fields[end_i] == ';') {
+                    filter.output_fields.emplace_back(output_fields.substr(start_i, end_i-start_i));
+                    start_i = end_i + 1;
+                }
+            }
+        }
         if (set_repeat_size) {
             filter.repeat_size = repeat_size;
         }

From e84b0b5d24f6f29973e5d530c3053cdefa8ddbfb Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Fri, 12 Jan 2024 15:00:41 +0100
Subject: [PATCH 04/14] Fix typo

---
 src/readfilter.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index d567a3698e2..7b72da5c708 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -387,7 +387,7 @@ void ReadFilter<Read>::filter_internal(istream* in) {
 
     if (write_tsv) {
         //write a header for the tsv
-        for (size_t i = 0 ; i < output_fields.size() i++) {
+        for (size_t i = 0 ; i < output_fields.size() ; i++) {
             const string& field = output_fields[i];
             cout << field;
             if (i == output_fields.size()-1) {

From fc0f90fabaabf866ec5ee984503f5fb39322f9a2 Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Mon, 22 Jan 2024 17:03:43 +0100
Subject: [PATCH 05/14] Write tsv header properly

---
 src/readfilter.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index 7b72da5c708..5309af0ed4e 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -390,7 +390,7 @@ void ReadFilter<Read>::filter_internal(istream* in) {
         for (size_t i = 0 ; i < output_fields.size() ; i++) {
             const string& field = output_fields[i];
             cout << field;
-            if (i == output_fields.size()-1) {
+            if (i != output_fields.size()-1) {
                 cout << "\t";
             }
         }

From a8d1eaf50aa54fd1994856beebd43a0d0e3877f3 Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Wed, 24 Jan 2024 14:33:28 +0100
Subject: [PATCH 06/14] Try to make tsv output parallel

---
 src/readfilter.hpp | 53 +++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index 5309af0ed4e..5152750e2bd 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -1369,34 +1369,39 @@ bool ReadFilter<Read>::matches_annotation(const Read& read) const {
 
 template<typename Read>
 void ReadFilter<Read>::emit_tsv(Read& read) {
-    cout << endl;
-    for (size_t i = 0 ; i < output_fields.size() ; i++) {
-        const string& field = output_fields[i];
-        if (field == "name") {
-            cout << read.name();
-        } else if (field == "correctly_mapped") {
-            if (is_correctly_mapped(read)) {
-                cout << "True";
+#pragma omp critical (cout)
+    {
+
+        cout << endl;
+        for (size_t i = 0 ; i < output_fields.size() ; i++) {
+            const string& field = output_fields[i];
+            if (field == "name") {
+                cout << read.name();
+            } else if (field == "correctly_mapped") {
+                if (is_correctly_mapped(read)) {
+                    cout << "True";
+                } else {
+                    cout << "False";
+                }
+            } else if (field == "mapping_quality") {
+                cout << get_mapq(read); 
+            } else if (field == "annotation") {
+                throw runtime_error("error: Cannot write all annotations");
+            } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") {
+                if (!has_annotation(read, field.substr(11, field.size()-11))) {
+                    throw runtime_error("error: Cannot find annotation "+ field);
+                } else {
+                    cout << get_annotation<string>(read, field.substr(11, field.size()-11));
+                }
             } else {
-                cout << "False";
+                cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl;
+                throw runtime_error("error: Writing non-existent field to tsv: " + field);
             }
-        } else if (field == "mapping_quality") {
-            cout << get_mapq(read); 
-        } else if (field == "annotation") {
-            throw runtime_error("error: Cannot write all annotations");
-        } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") {
-            if (!has_annotation(read, field.substr(11, field.size()-11))) {
-                throw runtime_error("error: Cannot find annotation "+ field);
-            } else {
-                cout << get_annotation<string>(read, field.substr(11, field.size()-11));
+            if (i != output_fields.size()-1) {
+                cout << "\t";
             }
-        } else {
-            cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl;
-            throw runtime_error("error: Writing non-existent field to tsv: " + field);
-        }
-        if (i != output_fields.size()-1) {
-            cout << "\t";
         }
+
     }
 }
 

From 4b74f91fb85f87ddd7148452212b0b7cecfc8fdf Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Wed, 24 Jan 2024 15:07:18 +0100
Subject: [PATCH 07/14] Turn off correctly mapped filter

---
 src/subcommand/filter_main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp
index dae80d2c8e1..b54642fabfc 100644
--- a/src/subcommand/filter_main.cpp
+++ b/src/subcommand/filter_main.cpp
@@ -436,7 +436,7 @@ int main_filter(int argc, char** argv) {
             filter.min_base_quality_fraction = min_base_quality_fraction;
         }
         filter.annotation_to_match = annotation;
-        filter.only_correctly_mapped = true;
+        filter.only_correctly_mapped = correctly_mapped;
         filter.complement_filter = complement_filter;
         filter.threads = get_thread_count();
         filter.graph = xindex;

From 7a8c38cd5f46fc0c0ae410e2facc3204cab1ace5 Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Wed, 24 Jan 2024 17:54:07 +0100
Subject: [PATCH 08/14] Don't make a alignment emitter when writing tsv

---
 src/subcommand/filter_main.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp
index b54642fabfc..af1c5678cbb 100644
--- a/src/subcommand/filter_main.cpp
+++ b/src/subcommand/filter_main.cpp
@@ -399,6 +399,7 @@ int main_filter(int argc, char** argv) {
         if (!output_fields.empty()){
             //Get the fields for tsv output
             filter.write_tsv = true;
+            filter.write_output = false;
             size_t start_i = 0;
             for (size_t end_i = 0 ; end_i <= output_fields.size() ; end_i++) {
                 if (end_i == output_fields.size() || output_fields[end_i] == ';') {

From 837fa357c1847878413c760d55cca188152b5f42 Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Wed, 24 Jan 2024 18:00:55 +0100
Subject: [PATCH 09/14] Write tsv output

---
 src/readfilter.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index 5152750e2bd..bc366387e3e 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -354,7 +354,7 @@ void ReadFilter<Read>::filter_internal(istream* in) {
 #endif
         Counts read_counts = filter_alignment(read);
         counts_vec[omp_get_thread_num()] += read_counts;
-        if ((read_counts.keep() != complement_filter) && write_output) {
+        if ((read_counts.keep() != complement_filter) && (write_output || write_tsv)) {
             if (write_tsv) {
                 emit_tsv(read);
             } else {
@@ -375,7 +375,7 @@ void ReadFilter<Read>::filter_internal(istream* in) {
             read_counts.set_paired_any();
         }
         counts_vec[omp_get_thread_num()] += read_counts;
-        if ((read_counts.keep() != complement_filter) && write_output) {
+        if ((read_counts.keep() != complement_filter) && (write_output || write_tsv)) {
             if (write_tsv) {
                 emit_tsv(read1);
                 emit_tsv(read2);

From 951db07cfa62df9cdbb372af7721db0453697cdd Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Wed, 24 Jan 2024 22:33:50 +0100
Subject: [PATCH 10/14] Try getting annotations with different types

---
 src/readfilter.hpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index bc366387e3e..f4c9fd2fca7 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -1360,8 +1360,16 @@ bool ReadFilter<Read>::matches_annotation(const Read& read) const {
         if (!has_annotation(read, annotation_key)) {
             return false;
         } else {
+            google::protobuf::Value value = read.annotation().fields().at(annotation_key);
             string annotation_val = annotation_to_match.substr(colon_pos+1, annotation_to_match.size() - colon_pos - 1);
-            return get_annotation<string>(read, annotation_key) == annotation_val; 
+            if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue) {
+                double read_value = get_annotation<double>(read, annotation_key);
+                return read_value == std::stod(annotation_val);
+            } else if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) {
+                return get_annotation<string>(read, annotation_key) == annotation_val; 
+            } else {
+                throw runtime_error("error: Cannot check equality of annotation " + annotation_to_match);
+            }
         }
     }
     
@@ -1391,7 +1399,16 @@ void ReadFilter<Read>::emit_tsv(Read& read) {
                 if (!has_annotation(read, field.substr(11, field.size()-11))) {
                     throw runtime_error("error: Cannot find annotation "+ field);
                 } else {
-                    cout << get_annotation<string>(read, field.substr(11, field.size()-11));
+                    string annotation_key = field.substr(11, field.size()-11);
+                    google::protobuf::Value value = read.annotation().fields().at(annotation_key);
+
+                    if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue) {
+                        cout << get_annotation<double>(read, annotation_key);
+                    } else if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) {
+                        cout << get_annotation<string>(read, annotation_key);
+                    } else {
+                        cout << "?";
+                    }
                 }
             } else {
                 cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl;

From 9a0a835553f534379686c2e01b52c6669b837a58 Mon Sep 17 00:00:00 2001
From: Xian Chang <xhchang@ucsc.edu>
Date: Mon, 26 Feb 2024 13:59:57 -0800
Subject: [PATCH 11/14] Add sequence to tsv output

---
 src/readfilter.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index f4c9fd2fca7..43aab1fed76 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -387,6 +387,7 @@ void ReadFilter<Read>::filter_internal(istream* in) {
 
     if (write_tsv) {
         //write a header for the tsv
+        cout << "#";
         for (size_t i = 0 ; i < output_fields.size() ; i++) {
             const string& field = output_fields[i];
             cout << field;
@@ -1393,6 +1394,8 @@ void ReadFilter<Read>::emit_tsv(Read& read) {
                 }
             } else if (field == "mapping_quality") {
                 cout << get_mapq(read); 
+            } else if (field == "sequence") {
+                cout << read.sequence(); 
             } else if (field == "annotation") {
                 throw runtime_error("error: Cannot write all annotations");
             } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") {

From 1e600baab0e16f71f3b4e76d88544eeab99f19d1 Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Thu, 29 Feb 2024 21:06:44 +0100
Subject: [PATCH 12/14] Add time_used

---
 src/readfilter.cpp | 11 -----------
 src/readfilter.hpp | 22 ++++++++++++++++++++--
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/readfilter.cpp b/src/readfilter.cpp
index e59d35f94ab..f24e84ce127 100644
--- a/src/readfilter.cpp
+++ b/src/readfilter.cpp
@@ -47,16 +47,5 @@ bool ReadFilter<MultipathAlignment>::is_mapped(const MultipathAlignment& mp_alig
     return false;
 }
 
-template<>
-bool ReadFilter<Alignment>::is_correctly_mapped(const Alignment& alignment) const {
-    return alignment.correctly_mapped();
-}
-
-//Looks like multipath alignments don't have a field for this
-template<>
-bool ReadFilter<MultipathAlignment>::is_correctly_mapped(const MultipathAlignment& alignment) const {
-    throw(std::runtime_error("error: multipath alignments don't have a field correctly_mapped"));
-    return true;
-}
 
 }
diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index 43aab1fed76..7876cfb5479 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -451,6 +451,18 @@ inline int ReadFilter<MultipathAlignment>::filter(istream* alignment_stream) {
     
     return 0;
 }
+template<>
+inline bool ReadFilter<Alignment>::is_correctly_mapped(const Alignment& alignment) const {
+    return alignment.correctly_mapped();
+}
+
+//Looks like multipath alignments don't have a field for this
+template<>
+inline bool ReadFilter<MultipathAlignment>::is_correctly_mapped(const MultipathAlignment& alignment) const {
+    throw(std::runtime_error("error: multipath alignments don't have a field correctly_mapped"));
+    return true;
+}
+
 
 template<>
 inline void ReadFilter<Alignment>::emit(Alignment& aln) {
@@ -1376,8 +1388,12 @@ bool ReadFilter<Read>::matches_annotation(const Read& read) const {
     
 }
 
-template<typename Read>
-void ReadFilter<Read>::emit_tsv(Read& read) {
+template<>
+inline void ReadFilter<MultipathAlignment>::emit_tsv(MultipathAlignment& read) {
+    return;
+}
+template<>
+inline void ReadFilter<Alignment>::emit_tsv(Alignment& read) {
 #pragma omp critical (cout)
     {
 
@@ -1396,6 +1412,8 @@ void ReadFilter<Read>::emit_tsv(Read& read) {
                 cout << get_mapq(read); 
             } else if (field == "sequence") {
                 cout << read.sequence(); 
+            } else if (field == "time_used") {
+                cout << read.time_used();
             } else if (field == "annotation") {
                 throw runtime_error("error: Cannot write all annotations");
             } else if (field.size() > 11 && field.substr(0, 11) == "annotation.") {

From 3de54147056064e30132a251067e975cb1fcc021 Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Fri, 1 Mar 2024 14:22:28 +0100
Subject: [PATCH 13/14] Add correctness tsv output that also checks
 off-reference

---
 src/readfilter.hpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index 7876cfb5479..71e01590e52 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -1408,6 +1408,14 @@ inline void ReadFilter<Alignment>::emit_tsv(Alignment& read) {
                 } else {
                     cout << "False";
                 }
+            } else if (field == "correctness") {
+                if (is_correctly_mapped(read)) {
+                    cout << "correct";
+                } else if (has_annotation(read, "no_truth") && get_annotation<bool>(read, "no_truth")) {
+                    cout << "off-reference";
+                } else {
+                    cout << "incorrect";
+                }
             } else if (field == "mapping_quality") {
                 cout << get_mapq(read); 
             } else if (field == "sequence") {

From 69c55b68d8b817f8eafdfa05ffd790d09f0a6c17 Mon Sep 17 00:00:00 2001
From: Xian <xhchang@ucsc.edu>
Date: Fri, 1 Mar 2024 14:35:55 +0100
Subject: [PATCH 14/14] Add check for boolean annotations

---
 src/readfilter.hpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index 71e01590e52..7c0fab58daa 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -1367,7 +1367,16 @@ bool ReadFilter<Read>::matches_annotation(const Read& read) const {
     size_t colon_pos = annotation_to_match.find(":");
     if (colon_pos == string::npos) {
         //If there was no colon, then just check for the existence of the annotation
-        return has_annotation(read, annotation_to_match);
+        // or, if it is a boolean value, check that it's true
+        if (!has_annotation(read, annotation_to_match)) {
+            return false;
+        }
+        google::protobuf::Value value = read.annotation().fields().at(annotation_to_match);
+        if (value.kind_case() == google::protobuf::Value::KindCase::kBoolValue) {
+            return get_annotation<bool>(read, annotation_to_match);
+        } else {
+            return true;
+        }
     } else {
         string annotation_key = annotation_to_match.substr(0, colon_pos);
         if (!has_annotation(read, annotation_key)) {