Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter reads #4234

Merged
merged 15 commits into from
Mar 2, 2024
Merged
3 changes: 3 additions & 0 deletions src/readfilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ ostream& operator<<(ostream& os, const Counts& counts) {
<< "Min Quality Filter: " << counts.counts[Counts::FilterName::min_mapq] << endl
<< "Min Base Quality Filter: " << counts.counts[Counts::FilterName::min_base_qual] << endl
<< "Random Filter: " << counts.counts[Counts::FilterName::random] << endl
<< "Annotation Filter: " << counts.counts[Counts::FilterName::annotation] << endl
<< "Incorrectly Mapped Filter: " << counts.counts[Counts::FilterName::incorrectly_mapped] << endl
<< endl;
return os;
}
Expand All @@ -45,4 +47,5 @@ bool ReadFilter<MultipathAlignment>::is_mapped(const MultipathAlignment& mp_alig
return false;
}


}
185 changes: 180 additions & 5 deletions src/readfilter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ class ReadFilter{
/// Sometimes we only want a report, and not a filtered gam. toggling off output
/// speeds things up considerably.
bool write_output = true;
/// Sometimes we want to pick out fields and write a tsv instead of a gam;
bool write_tsv = false;
/// What fields do we want to write to the tsv?
vector<string> output_fields;
/// A HandleGraph is required for some filters (Note: ReadFilter doesn't own/free this)
const HandleGraph* graph = nullptr;
/// Interleaved input
Expand All @@ -102,6 +106,14 @@ class ReadFilter{
int min_base_quality = numeric_limits<int>::min() / 2;
// minimum fraction of bases in reads that must have quality at least <min_base_quality>
double min_base_quality_fraction = numeric_limits<double>::lowest();

/// A string formatted "annotation[.subfield]*:value"
/// Value is optional if the key is a flag
/// Used like jq select
string annotation_to_match = "";

/// Filter to only correctly mapped reads
bool only_correctly_mapped = false;

/**
* Run all the filters on an alignment. The alignment may get modified in-place by the defray filter
Expand Down Expand Up @@ -243,7 +255,17 @@ class ReadFilter{
* Does the read contain at least one of the indicated sequences
*/
bool contains_subsequence(const Read& read) const;

/**
* Does has the given annotation and does it match
*/
bool matches_annotation(const Read& read) const;

/**
* Check if the alignment is marked as being correctly mapped
*/
bool is_correctly_mapped(const Read& read) const;

/**
* Write the read to stdout
*/
Expand All @@ -253,6 +275,12 @@ class ReadFilter{
* Write a read pair to stdout
*/
void emit(Read& read1, Read& read2);

/**
* Write a tsv line for a read to stdout
*/
void emit_tsv(Read& read);



/// The twp specializations have different writing infrastructure
Expand All @@ -268,7 +296,7 @@ struct Counts {
// note: "last" must be kept as the final value in this enum
enum FilterName { read = 0, wrong_name, wrong_refpos, excluded_feature, min_score, min_sec_score, max_overhang,
min_end_matches, min_mapq, split, repeat, defray, defray_all, random, min_base_qual, subsequence, filtered,
proper_pair, unmapped, last};
proper_pair, unmapped, annotation, incorrectly_mapped, last};
vector<size_t> counts;
Counts () : counts(FilterName::last, 0) {}
Counts& operator+=(const Counts& other) {
Expand Down Expand Up @@ -326,8 +354,12 @@ void ReadFilter<Read>::filter_internal(istream* in) {
#endif
Counts read_counts = filter_alignment(read);
counts_vec[omp_get_thread_num()] += read_counts;
if ((read_counts.keep() != complement_filter) && write_output) {
emit(read);
if ((read_counts.keep() != complement_filter) && (write_output || write_tsv)) {
if (write_tsv) {
emit_tsv(read);
} else {
emit(read);
}
}
};

Expand All @@ -343,10 +375,27 @@ void ReadFilter<Read>::filter_internal(istream* in) {
read_counts.set_paired_any();
}
counts_vec[omp_get_thread_num()] += read_counts;
if ((read_counts.keep() != complement_filter) && write_output) {
emit(read1, read2);
if ((read_counts.keep() != complement_filter) && (write_output || write_tsv)) {
if (write_tsv) {
emit_tsv(read1);
emit_tsv(read2);
} else {
emit(read1, read2);
}
}
};

if (write_tsv) {
//write a header for the tsv
cout << "#";
for (size_t i = 0 ; i < output_fields.size() ; i++) {
const string& field = output_fields[i];
cout << field;
if (i != output_fields.size()-1) {
cout << "\t";
}
}
}

if (interleaved) {
vg::io::for_each_interleaved_pair_parallel(*in, pair_lambda);
Expand Down Expand Up @@ -402,6 +451,18 @@ inline int ReadFilter<MultipathAlignment>::filter(istream* alignment_stream) {

return 0;
}
template<>
inline bool ReadFilter<Alignment>::is_correctly_mapped(const Alignment& alignment) const {
return alignment.correctly_mapped();
}

//Looks like multipath alignments don't have a field for this
template<>
inline bool ReadFilter<MultipathAlignment>::is_correctly_mapped(const MultipathAlignment& alignment) const {
throw(std::runtime_error("error: multipath alignments don't have a field correctly_mapped"));
return true;
}


template<>
inline void ReadFilter<Alignment>::emit(Alignment& aln) {
Expand Down Expand Up @@ -536,6 +597,19 @@ Counts ReadFilter<Read>::filter_alignment(Read& read) {
keep = false;
}
}
if ((keep || verbose) && !annotation_to_match.empty()) {
if (!matches_annotation(read)) {
++counts.counts[Counts::FilterName::annotation];
keep = false;
}
}

if ((keep || verbose) && only_correctly_mapped) {
if (!is_correctly_mapped(read)) {
++counts.counts[Counts::FilterName::incorrectly_mapped];
keep = false;
}
}

if (!keep) {
++counts.counts[Counts::FilterName::filtered];
Expand Down Expand Up @@ -1286,6 +1360,107 @@ bool ReadFilter<Read>::contains_subsequence(const Read& read) const {
return found;
}

template<typename Read>
bool ReadFilter<Read>::matches_annotation(const Read& read) const {

//Assume that there's only one level of annotations
size_t colon_pos = annotation_to_match.find(":");
if (colon_pos == string::npos) {
//If there was no colon, then just check for the existence of the annotation
// or, if it is a boolean value, check that it's true
if (!has_annotation(read, annotation_to_match)) {
return false;
}
google::protobuf::Value value = read.annotation().fields().at(annotation_to_match);
if (value.kind_case() == google::protobuf::Value::KindCase::kBoolValue) {
return get_annotation<bool>(read, annotation_to_match);
} else {
return true;
}
} else {
string annotation_key = annotation_to_match.substr(0, colon_pos);
if (!has_annotation(read, annotation_key)) {
return false;
} else {
google::protobuf::Value value = read.annotation().fields().at(annotation_key);
string annotation_val = annotation_to_match.substr(colon_pos+1, annotation_to_match.size() - colon_pos - 1);
if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue) {
double read_value = get_annotation<double>(read, annotation_key);
return read_value == std::stod(annotation_val);
} else if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) {
return get_annotation<string>(read, annotation_key) == annotation_val;
} else {
throw runtime_error("error: Cannot check equality of annotation " + annotation_to_match);
}
}
}

}

template<>
inline void ReadFilter<MultipathAlignment>::emit_tsv(MultipathAlignment& read) {
return;
}
template<>
inline void ReadFilter<Alignment>::emit_tsv(Alignment& read) {
#pragma omp critical (cout)
{

cout << endl;
for (size_t i = 0 ; i < output_fields.size() ; i++) {
const string& field = output_fields[i];
if (field == "name") {
cout << read.name();
} else if (field == "correctly_mapped") {
if (is_correctly_mapped(read)) {
cout << "True";
} else {
cout << "False";
}
} else if (field == "correctness") {
if (is_correctly_mapped(read)) {
cout << "correct";
} else if (has_annotation(read, "no_truth") && get_annotation<bool>(read, "no_truth")) {
cout << "off-reference";
} else {
cout << "incorrect";
}
} else if (field == "mapping_quality") {
cout << get_mapq(read);
} else if (field == "sequence") {
cout << read.sequence();
} else if (field == "time_used") {
cout << read.time_used();
} else if (field == "annotation") {
throw runtime_error("error: Cannot write all annotations");
} else if (field.size() > 11 && field.substr(0, 11) == "annotation.") {
if (!has_annotation(read, field.substr(11, field.size()-11))) {
throw runtime_error("error: Cannot find annotation "+ field);
} else {
string annotation_key = field.substr(11, field.size()-11);
google::protobuf::Value value = read.annotation().fields().at(annotation_key);

if (value.kind_case() == google::protobuf::Value::KindCase::kNumberValue) {
cout << get_annotation<double>(read, annotation_key);
} else if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) {
cout << get_annotation<string>(read, annotation_key);
} else {
cout << "?";
}
}
} else {
cerr << "I didn't implement all fields for tsv's so if I missed something let me know and I'll add it -Xian" << endl;
throw runtime_error("error: Writing non-existent field to tsv: " + field);
}
if (i != output_fields.size()-1) {
cout << "\t";
}
}

}
}


template<typename Read>
bool ReadFilter<Read>::sample_read(const Read& read) const {
// Decide if the alignment is paired.
Expand Down
37 changes: 36 additions & 1 deletion src/subcommand/filter_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ void help_filter(char** argv) {
<< " -x, --xg-name FILE use this xg index or graph (required for -S and -D)" << endl
<< " -v, --verbose print out statistics on numbers of reads filtered by what." << endl
<< " -V, --no-output print out statistics (as above) but do not write out filtered GAM." << endl
<< " -T, --tsv-out FIELD[;FIELD] do not write filtered gam but a tsv of the given fields" << endl
<< " -q, --min-mapq N filter alignments with mapping quality < N" << endl
<< " -E, --repeat-ends N filter reads with tandem repeat (motif size <= 2N, spanning >= N bases) at either end" << endl
<< " -D, --defray-ends N clip back the ends of reads that are ambiguously aligned, up to N bases" << endl
Expand All @@ -56,6 +57,9 @@ void help_filter(char** argv) {
<< " -i, --interleaved assume interleaved input. both ends will be filtered out if either fails filter" << endl
<< " -I, --interleaved-all assume interleaved input. both ends will be filtered out if *both* fail filters" << endl
<< " -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q." << endl
<< " -B, --annotation K[:V] keep reads if the annotation is present. If a value is given, keep reads if the values are equal" << endl
<< " similar to running jq 'select(.annotation.K==V)' on the json" << endl
<< " -c, --correctly-mapped keep only reads that are marked as correctly-mapped" << endl
<< " -U, --complement apply the complement of the filter implied by the other arguments." << endl
<< " -t, --threads N number of threads [1]" << endl;
}
Expand Down Expand Up @@ -105,6 +109,9 @@ int main_filter(int argc, char** argv) {
bool complement_filter = false;
bool only_proper_pairs = false;
bool only_mapped = false;
string annotation = "";
string output_fields = "";
bool correctly_mapped = false;

// What XG index, if any, should we load to support the other options?
string xg_name;
Expand Down Expand Up @@ -133,6 +140,7 @@ int main_filter(int argc, char** argv) {
{"drop-split", no_argument, 0, 'S'},
{"xg-name", required_argument, 0, 'x'},
{"verbose", no_argument, 0, 'v'},
{"tsv-out", no_argument, 0, 'T'},
{"min-mapq", required_argument, 0, 'q'},
{"repeat-ends", required_argument, 0, 'E'},
{"defray-ends", required_argument, 0, 'D'},
Expand All @@ -141,13 +149,15 @@ int main_filter(int argc, char** argv) {
{"interleaved", no_argument, 0, 'i'},
{"interleaved-all", no_argument, 0, 'I'},
{"min-base-quality", required_argument, 0, 'b'},
{"annotation", required_argument, 0, 'B'},
{"correctly-mapped", no_argument, 0, 'c'},
{"complement", no_argument, 0, 'U'},
{"threads", required_argument, 0, 't'},
{0, 0, 0, 0}
};

int option_index = 0;
c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:Ut:",
c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVT:q:E:D:C:d:iIb:B:cUt:",
long_options, &option_index);

/* Detect the end of the options. */
Expand Down Expand Up @@ -244,6 +254,9 @@ int main_filter(int argc, char** argv) {
verbose = true;
write_output = false;
break;
case 'T':
output_fields=optarg;
break;
case 'E':
set_repeat_size = true;
repeat_size = parse<int>(optarg);
Expand Down Expand Up @@ -308,6 +321,12 @@ int main_filter(int argc, char** argv) {
}
}
break;
case 'B':
annotation = optarg;
break;
case 'c':
correctly_mapped = true;
break;
case 'U':
complement_filter = true;
break;
Expand Down Expand Up @@ -375,6 +394,20 @@ int main_filter(int argc, char** argv) {
}
filter.verbose = verbose;
filter.write_output = write_output;


if (!output_fields.empty()){
//Get the fields for tsv output
filter.write_tsv = true;
filter.write_output = false;
size_t start_i = 0;
for (size_t end_i = 0 ; end_i <= output_fields.size() ; end_i++) {
if (end_i == output_fields.size() || output_fields[end_i] == ';') {
filter.output_fields.emplace_back(output_fields.substr(start_i, end_i-start_i));
start_i = end_i + 1;
}
}
}
if (set_repeat_size) {
filter.repeat_size = repeat_size;
}
Expand Down Expand Up @@ -403,6 +436,8 @@ int main_filter(int argc, char** argv) {
filter.min_base_quality = min_base_quality;
filter.min_base_quality_fraction = min_base_quality_fraction;
}
filter.annotation_to_match = annotation;
filter.only_correctly_mapped = correctly_mapped;
filter.complement_filter = complement_filter;
filter.threads = get_thread_count();
filter.graph = xindex;
Expand Down
Loading