From 3133604b8f7433ec819c13964cf78e6e05660c01 Mon Sep 17 00:00:00 2001 From: Adrian Tan Date: Mon, 5 May 2014 17:46:16 -0400 Subject: [PATCH] various changes to simplify overlap matching with features of interest --- Makefile | 4 +- annotate_variants.cpp | 58 ++++++++++----------- annotate_variants.h | 1 + bed.cpp | 24 +++++++++ bed.h | 65 +++++++++++++++++++++++ genome_interval.cpp | 8 +++ genome_interval.h | 7 ++- overlap_region_matcher.cpp | 25 +++++++++ overlap_region_matcher.h | 102 +++++++++++++++++++++++++++++++++++++ tbx_ordered_reader.cpp | 33 +++++++++++- tbx_ordered_reader.h | 11 +++- 11 files changed, 303 insertions(+), 35 deletions(-) create mode 100644 bed.cpp create mode 100644 bed.h create mode 100644 overlap_region_matcher.cpp create mode 100644 overlap_region_matcher.h diff --git a/Makefile b/Makefile index 56fcf0b..63016bc 100644 --- a/Makefile +++ b/Makefile @@ -54,7 +54,9 @@ SOURCES = program\ estimator\ profile_afs\ profile_hwe\ - profile_len + profile_len\ + overlap_region_matcher\ + bed SOURCESONLY = main.cpp diff --git a/annotate_variants.cpp b/annotate_variants.cpp index ee6ef96..ac9106b 100644 --- a/annotate_variants.cpp +++ b/annotate_variants.cpp @@ -40,7 +40,7 @@ class Igor : Program std::string output_vcf_file; std::vector intervals; std::string interval_list; - std::string gencode_gtf_file; + std::string gencode_gtf_file; bool annotate_coding; /////// @@ -48,6 +48,7 @@ class Igor : Program /////// BCFOrderedReader *odr; BCFOrderedWriter *odw; + OverlapRegionMatcher *orm; ///////// //stats// @@ -114,19 +115,20 @@ class Igor : Program // bcf_hdr_append(odw->hdr, "##INFO="); // bcf_hdr_append(odw->hdr, "##INFO="); // bcf_hdr_append(odw->hdr, "##INFO="); + bcf_hdr_append(odw->hdr, "##INFO="); /////////////////////// //tool initialization// /////////////////////// vm = new VariantManip(ref_fasta_file); - + if (annotate_coding) - { + { bcf_hdr_append(odw->hdr, "##INFO="); bcf_hdr_append(odw->hdr, "##INFO="); gc = new GENCODE(gencode_gtf_file, ref_fasta_file); } - + //////////////////////// //stats initialization// //////////////////////// @@ -155,6 +157,9 @@ class Igor : Program { odw->write_hdr(); + std::string str = "/net/fantasia/home/atks/ref/vt/grch37/mdust.bed.gz"; + orm = new OverlapRegionMatcher(str); + bcf1_t *v = odw->get_bcf1_from_pool(); std::vector overlaps; Variant variant; @@ -166,40 +171,33 @@ class Igor : Program std::string chrom = bcf_get_chrom(odr->hdr,v); int32_t start1 = bcf_get_pos1(v); int32_t end1 = bcf_get_end_pos1(v); - + vm->vtype2string(vtype, &s); if (s.l) - { + { bcf_update_info_string(odr->hdr, v, "VT", s.s); } - + if (vtype==VT_SNP) { //synonymous and non synonymous annotation - - } + + } else if (vtype&VT_INDEL) { + if (orm->overlaps_with(chrom, start1+1, end1)) + { + bcf_update_info_flag(odr->hdr, v, "LOW_COMPLEXITY", "", 1); + } + //frame shift annotation if (annotate_coding) { -// if (gc->overlaps_with(chrom, start1+1, end1, GC_FT_CDS)) -// { -// if (variant.exists_frame_shift()) -// { -// bcf_update_info_flag(odr->hdr, v, "GENCODE_FS", "", 1); -// } -// else -// { -// bcf_update_info_flag(odr->hdr, v, "GENCODE_NFS", "", 1); -// } -// } - gc->search(chrom, start1+1, end1, overlaps); - + bool cds_found = false; bool is_fs = false; - + for (int32_t i=0; ihdr, v, "GENCODE_NFS", "", 1); } } - - //classify STR + + //classify STR std::string ru = "ACGT"; int32_t rl = 4; } // bcf_update_info_string(odr->hdr, v, "RU", ru.c_str()); - // bcf_update_info_int32(odr->hdr, v, "RL", &rl, 1); + // bcf_update_info_int32(odr->hdr, v, "RL", &rl, 1); } - + ++no_variants_annotated; odw->write(v); v = odw->get_bcf1_from_pool(); } - + odw->close(); }; - + private: }; diff --git a/annotate_variants.h b/annotate_variants.h index 6e102a2..f9c82d6 100644 --- a/annotate_variants.h +++ b/annotate_variants.h @@ -48,6 +48,7 @@ #include "variant_manip.h" #include "log_tool.h" #include "gencode.h" +#include "overlap_region_matcher.h" void annotate_variants(int argc, char ** argv); diff --git a/bed.cpp b/bed.cpp new file mode 100644 index 0000000..0b73d20 --- /dev/null +++ b/bed.cpp @@ -0,0 +1,24 @@ +/* The MIT License + +Copyright (c) 2014 Adrian Tan + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "bed.h" diff --git a/bed.h b/bed.h new file mode 100644 index 0000000..3d5a312 --- /dev/null +++ b/bed.h @@ -0,0 +1,65 @@ +/* The MIT License + + Copyright (c) 2014 Adrian Tan + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef BED_H +#define BED_H + +#include "htslib/kstring.h" +#include "utils.h" +#include "interval_tree.h" + +class BEDRecord : public Interval +{ + public: + std::string chrom; + int32_t start1, end1; + + BEDRecord(kstring_t *s) + { + std::vector fields; + split(fields, "\t", s->s); + + chrom = fields[0]; + str2int32(fields[3], start1); + str2int32(fields[4], end1); + }; + + BEDRecord(std::string& chrom, int32_t start1, int32_t end1) + { + this->chrom = chrom; + this->start1 = start1; + this->end1 = end1; + }; + + /** + * Prints this BED record to STDERR. + */ + void print() + { + std::cerr << this->chrom << ":" << this->start1 << "-" <end1 << "\n"; + }; + + private: +}; + +#endif \ No newline at end of file diff --git a/genome_interval.cpp b/genome_interval.cpp index c3dcee9..8281aa4 100644 --- a/genome_interval.cpp +++ b/genome_interval.cpp @@ -126,4 +126,12 @@ void GenomeInterval::to_string(kstring_t *interval) kputc('-', interval); kputw(end1, interval); } +}; + +/** + * Checks if this interval overlap. + */ +bool GenomeInterval::overlaps_with(std::string& chrom, int32_t start1, int32_t end1) +{ + return (seq==chrom && this->start1<=end1 && this->end1>=start1); }; \ No newline at end of file diff --git a/genome_interval.h b/genome_interval.h index eecf2af..0f8efc2 100644 --- a/genome_interval.h +++ b/genome_interval.h @@ -72,7 +72,7 @@ class GenomeInterval * Sets an interval. */ void set(std::string interval); - + /** * Converts genome interval into the entire chromosome. */ @@ -87,6 +87,11 @@ class GenomeInterval * Returns a string representation of this Genome Interval. */ void to_string(kstring_t *interval); + + /** + * Checks if this interval overlap. + */ + bool overlaps_with(std::string& chrom, int32_t start1, int32_t end1); }; #endif \ No newline at end of file diff --git a/overlap_region_matcher.cpp b/overlap_region_matcher.cpp new file mode 100644 index 0000000..a62d996 --- /dev/null +++ b/overlap_region_matcher.cpp @@ -0,0 +1,25 @@ +/* The MIT License + + Copyright (c) 2014 Adrian Tan + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "overlap_region_matcher.h" + diff --git a/overlap_region_matcher.h b/overlap_region_matcher.h new file mode 100644 index 0000000..33fcba9 --- /dev/null +++ b/overlap_region_matcher.h @@ -0,0 +1,102 @@ +/* The MIT License + + Copyright (c) 2014 Adrian Tan + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef OVERLAP_REGION_MATCHER_H +#define OVERLAP_REGION_MATCHER_H + +#include "htslib/vcf.h" +#include "htslib/kseq.h" +#include "program.h" +#include "hts_utils.h" +#include "tbx_ordered_reader.h" +#include "interval_tree.h" +#include + +class OverlapRegionMatcher +{ + public: + + /////////// + //options// + /////////// + std::string input_file; + + /////// + //i/o// + /////// + TBXOrderedReader *todr; + + + kstring_t s; + + GenomeInterval current_interval; + std::list buffer; + bool end_of_file; + + OverlapRegionMatcher(std::string& file) + { + todr = new TBXOrderedReader(file); + }; + + /** + * + */ + bool overlaps_with(std::string& chrom, int32_t start1, int32_t end1) + { + if (current_interval.seq!=chrom) + { + buffer.clear(); + GenomeInterval ginterval(chrom); + todr->jump_to_interval(ginterval); + while (todr->read(&s)) + { + + // buffer.add() + } + + } + if (current_interval.start1=start1) + { + return true; + } + else + { + return false; + } + + + }; + + /** + * Clear buffer. + */ + void clear_buffer() + { + } + + ~OverlapRegionMatcher() {}; + + private: +}; + +#endif diff --git a/tbx_ordered_reader.cpp b/tbx_ordered_reader.cpp index 1a05583..2b90d87 100644 --- a/tbx_ordered_reader.cpp +++ b/tbx_ordered_reader.cpp @@ -23,7 +23,38 @@ #include "tbx_ordered_reader.h" -TBXOrderedReader::TBXOrderedReader(std::string hts_file, std::vector& intervals) +/** + * Initialize files and intervals. + * + * @hts name of the input file + */ +TBXOrderedReader::TBXOrderedReader(std::string& hts_file) +{ + this->hts_file = hts_file; + interval_index = 0; + + hts = NULL; + tbx = NULL; + itr = NULL; + + hts = hts_open(hts_file.c_str(), "r"); + + index_loaded = false; + if ((tbx = tbx_index_load(hts_file.c_str()))) + { + index_loaded = true; + } + + random_access_enabled = index_loaded; +}; + +/** + * Initialize files and intervals. + * + * @hts name of the input file + * @intervals list of intervals, if empty, all records are selected. + */ +TBXOrderedReader::TBXOrderedReader(std::string& hts_file, std::vector& intervals) { this->hts_file = hts_file; this->intervals = intervals; diff --git a/tbx_ordered_reader.h b/tbx_ordered_reader.h index b6f7e28..eff1e68 100644 --- a/tbx_ordered_reader.h +++ b/tbx_ordered_reader.h @@ -84,14 +84,21 @@ class TBXOrderedReader //shared objects for string manipulation kstring_t s; - + + /** + * Initialize files and intervals. + * + * @hts name of the input file + */ + TBXOrderedReader(std::string& hts_file); + /** * Initialize files and intervals. * * @hts name of the input file * @intervals list of intervals, if empty, all records are selected. */ - TBXOrderedReader(std::string hts_file, std::vector& intervals); + TBXOrderedReader(std::string& hts_file, std::vector& intervals); /** * Jump to interval. Returns false if not successful.