From f296b8452c7b20dbffd69c02aef1a55291614c8e Mon Sep 17 00:00:00 2001 From: kcleal Date: Tue, 4 Mar 2025 15:18:14 +0000 Subject: [PATCH] #125 better calling of low support SVs --- dysgu/call_component.pyx | 64 ++++------- dysgu/cluster.pyx | 2 +- dysgu/coverage.pyx | 62 +++-------- dysgu/graph.pyx | 234 +++++++++++++++++++-------------------- dysgu/post_call.py | 9 +- pyproject.toml | 2 +- 6 files changed, 155 insertions(+), 218 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 0d5ebcf..40c259b 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -672,7 +672,6 @@ cdef make_single_call(sub_informative, insert_size, insert_stdev, insert_ppf, mi as2 = None ref_bases = 0 if to_assemble or len(spanning_alignments) > 0: - # echo('MAKE SINGLE CALL') if er.preciseA: as1 = consensus.base_assemble(u_reads, er.posA, 500) if as1 and (er.svtype != "TRA" or (as1['contig'] and (as1['contig'][0].islower() or as1['contig'][-1].islower()))): @@ -965,7 +964,6 @@ cdef linear_scan_clustering(spanning, bint hp_tag): def process_spanning(bint paired_end, spanning_alignments, float divergence, length_extend, informative, generic_insertions, float insert_ppf, bint to_assemble, bint hp_tag): - # echo("PROCESS SPANNING") cdef int min_found_support = 0 cdef str svtype, jointype cdef bint passed @@ -2055,33 +2053,10 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, seen.remove(u) if v in seen: seen.remove(v) - events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs, sites_info, paired_end, hp_tag) - # finds reads that should be a single partition - # u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v) - # if len(u_reads) > 0 and len(v_reads) > 0: - # events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs, - # sites_info, paired_end) - # if u_single: - # res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - # sites_info, paired_end, length_extend, divergence) - # if res: - # if isinstance(res, EventResult): - # events.append(res) - # else: - # events += res - # if v_single: - # res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - # sites_info, paired_end, length_extend, divergence) - # if res: - # if isinstance(res, EventResult): - # events.append(res) - # else: - # events += res - # Process any singles / unconnected blocks if seen: for part_key in seen: @@ -2101,23 +2076,28 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, events += res # Check for events within clustered nodes - if data.s_within: - for k, d in data.s_within: #.items(): - o_count = out_counts[k] - i_counts = len(d) - if i_counts > max_single_size: - continue - if o_count > 0 and i_counts > (2*min_support) and i_counts > o_count: - rds = get_reads(bam, d, data.reads, data.n2n, 0, info) - if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): - continue - res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - sites_info, paired_end, length_extend, divergence, hp_tag) - if res: - if isinstance(res, EventResult): - events.append(res) - else: - events += res + # if data.s_within: + # for k, d in data.s_within: + # o_count = out_counts[k] + # i_counts = len(d) + # + # if i_counts > max_single_size: + # continue + # # if o_count > 0 and i_counts >= (2*min_support) and i_counts > o_count: + # # if o_count > 0 or i_counts > 0: + # rds = get_reads(bam, d, data.reads, data.n2n, 0, info) + # if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): + # continue + # echo('Calling single 1') + # res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + # sites_info, paired_end, length_extend, divergence, hp_tag) + # for r in res: + # echo(r.posA, r.posB, r.svlen) + # if res: + # if isinstance(res, EventResult): + # events.append(res) + # else: + # events += res return events diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 6ce56ad..dd9b56c 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -273,7 +273,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N if args["pl"] == "pe": # reads with internal SVs can be detected at lower support lower_bound_support = min_support - 1 if min_support - 1 > 1 else 1 else: - lower_bound_support = 2 + lower_bound_support = min_support component_path = f"{tdir}/components.bin" cdef bytes cmp_file = component_path.encode("ascii") # write components to file if low-mem used diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx index 32ca3e5..1df5430 100644 --- a/dysgu/coverage.pyx +++ b/dysgu/coverage.pyx @@ -112,9 +112,6 @@ cdef class GenomeScanner: self.procs = read_threads self.buff_size = buffer_size self.current_bin = [] - self.current_cov = 0 - self.current_chrom = 0 - self.current_pos = 0 self.current_cov_array = None self.reads_dropped = 0 self.depth_d = {} @@ -152,10 +149,10 @@ cdef class GenomeScanner: if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0: # not primary, duplicate or unmapped? continue - self._add_to_bin_buffer(aln, tell) + yield aln, tell tell = 0 if self.no_tell else self.input_bam.tell() - while len(self.staged_reads) > 0: - yield self.staged_reads.popleft() + # yield aln, tell + # when 'call' command was run only. generate coverage track here else: self.cpp_cov_track.set_max_cov(self.max_cov) @@ -167,7 +164,7 @@ cdef class GenomeScanner: cigar_l = aln._delegate.core.n_cigar if cigar_l == 0 or aln.flag & 1284: - tell = 0 if self.no_tell else self.input_bam.tell() + # tell = 0 if self.no_tell else self.input_bam.tell() continue if aln.rname != self.current_tid: @@ -203,18 +200,14 @@ cdef class GenomeScanner: index_start += length if aln.mapq < mq_thresh or not good_read or not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos): - tell = 0 if self.no_tell else self.input_bam.tell() continue - - self._add_to_bin_buffer(aln, tell) + yield aln, tell tell = 0 if self.no_tell else self.input_bam.tell() - while len(self.staged_reads) > 0: - yield self.staged_reads.popleft() + # yield aln, tell + if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences: out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path, self.input_bam.get_reference_name(self.current_tid)).encode("ascii") self.cpp_cov_track.write_track(out_path) - if len(self.current_bin) > 0: - yield self.current_bin else: logging.info("Searching --regions and collecting reads from mate-pair locations") @@ -291,16 +284,13 @@ cdef class GenomeScanner: continue if not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos): continue - self._add_to_bin_buffer(aln, tell) + yield aln, tell tell = 0 if self.no_tell else self.input_bam.tell() - while len(self.staged_reads) > 0: - yield self.staged_reads.popleft() + # yield aln, tell if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences: out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path, self.input_bam.get_reference_name(self.current_tid)).encode("ascii") self.cpp_cov_track.write_track(out_path) - if len(self.current_bin) > 0: - yield self.current_bin def get_read_properties(self, int max_tlen, int insert_median, int insert_stdev, int read_len, ibam=None, find_divergence=False): @@ -352,7 +342,8 @@ cdef class GenomeScanner: continue tell = 0 if self.no_tell else self.input_bam.tell() if self.no_tell: - self._add_to_bin_buffer(a, tell) + self.staged_reads.append((a, tell)) + # self._add_to_bin_buffer(a, tell) if a.rname != self.current_tid: if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences: out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path.outpath, self.input_bam.get_reference_name(self.current_tid)).encode("ascii") @@ -447,7 +438,7 @@ cdef class GenomeScanner: # Read the rest of the genome, reads are sent in blocks cdef int total_reads = 0 for staged in self._get_reads(): - total_reads += len(staged) + total_reads += 1 # len(staged) yield staged if total_reads == 0: logging.critical("No reads found, finishing") @@ -464,33 +455,6 @@ cdef class GenomeScanner: elif self.no_tell: raise BufferError("Read buffer has overflowed, increase --buffer-size") - def _add_to_bin_buffer(self, AlignedSegment a, tell): - # Calculates coverage information on fly, drops high coverage regions, buffers reads - cdef int flag = a.flag - cdef uint32_t cigar_l = a._delegate.core.n_cigar - if flag & 1540 or cigar_l == 0: # or a.seq is None: - return - cdef int rname = a.rname - cdef int apos = a.pos - cdef int bin_pos = int(apos / 100) - cdef int ref_length - cdef str reference_name = "" - cdef int aend = a.reference_end - cdef float current_coverage - if self.current_chrom != rname: - self.current_chrom = rname - # in_roi = False - # if self.overlap_regions: - # in_roi = intersecter(self.overlap_regions, a.rname, apos, apos+1) - if rname == self.current_chrom and bin_pos == self.current_pos: - self.current_bin.append((a, tell)) - else: - if len(self.current_bin) != 0: - self.staged_reads.append(self.current_bin) - self.current_chrom = rname - self.current_pos = bin_pos - self.current_bin = [(a, tell)] - cdef float add_coverage(int start, int end, DTYPE_t[:] chrom_depth) nogil: cdef float fs = start / 100 @@ -622,7 +586,7 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov r.svlen = 1000000 r.mcov = max_depth - r.a_freq = r.a_freq / reads_10kb + r.a_freq = r.a_freq / (reads_10kb + 1e-5) r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2) new_events.append(r) return new_events diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index cd0c984..2f0353d 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -827,7 +827,6 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_ pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, trust_ins_len) - # other_nodes = pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, trust_ins_len) if not dereference(found_nodes_exact).empty(): for other_node in dereference(found_nodes_exact): if node_to_name.same_template(node_name, other_node): @@ -1289,144 +1288,135 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering cdef int n_checked_for_hp_tag = 0 if no_phase: n_checked_for_hp_tag = 10_001 - for chunk in genome_scanner.iter_genome(): - for r, tell in chunk: - if r.mapq < mapq_thresh: + + for r, tell in genome_scanner.iter_genome(): + if r.mapq < mapq_thresh: + continue + pos2 = -1 + event_pos = r.pos + added = False + clipped = 0 + events_to_add.clear() + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + if not paired_end: + if n_checked_for_hp_tag < 10_000: + n_checked_for_hp_tag += 1 + if r.has_tag("HP"): + hp_tag_found = True + n_checked_for_hp_tag = 10_001 + if r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")): continue - pos2 = -1 - event_pos = r.pos - added = False - clipped = 0 - events_to_add.clear() - cigar_l = r._delegate.core.n_cigar - cigar_p = bam_get_cigar(r._delegate) - if not paired_end: - if n_checked_for_hp_tag < 10_000: - n_checked_for_hp_tag += 1 - if r.has_tag("HP"): - hp_tag_found = True - n_checked_for_hp_tag = 10_001 - if r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")): + + if cigar_l > 1: + if r.has_tag("SA"): + # Set cigar-index to -1 means it is unset, will be determined during SA parsing + cigar_index = -1 + process_alignment(G, r, clip_l, max_dist, gettid, + overlap_regions, clustering_dist, pe_scope, + cigar_index, event_pos, paired_end, tell, genome_scanner, + template_edges, node_to_name, pos2, mapq_thresh, clip_scope, ReadEnum_t.SPLIT, + bad_clip_counter, mm_only, site_adder, 0, trust_ins_len) + added = True + for cigar_index in range(cigar_l): + cigar_value = cigar_p[cigar_index] + opp = cigar_value & 15 + length = cigar_value >> 4 + + if opp == 0 or opp == 7 or opp == 8: + if find_n_aligned_bases: + n_aligned_bases += length + event_pos += length continue - if cigar_l > 1: - if r.has_tag("SA"): - # Set cigar-index to -1 means it is unset, will be determined during SA parsing - cigar_index = -1 + if opp == 1: + if length >= min_sv_size: + pos2 = event_pos + length + events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION)) + added = True + elif opp == 2: + if length >= min_sv_size: + pos2 = event_pos + length + events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION)) + added = True + event_pos += length + elif opp == 3: + if length >= min_sv_size: + pos2 = event_pos + length + events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP)) + added = True + event_pos += length + else: + if opp != 4 and opp != 5: + event_pos += length + elif opp == 4 and length >= clip_l: + clipped = 1 + + if (paired_end and not added and contigs) or not paired_end: + # Whole alignment will be used, try infer position from soft-clip + cigar_index = -1 + pos2 = -1 + left_clip_size = 0 + right_clip_size = 0 + clip_sizes_hard(r, &left_clip_size, &right_clip_size) # soft and hard-clips + if r.flag & 8 and clipped: # paired by inference + # skip if both ends are clipped, usually means its a chunk of badly mapped sequence + # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ): + if not (left_clip_size and right_clip_size) and good_quality_clip(r, 20): + # Mate is unmapped, insertion type. Only add if soft-clip is available process_alignment(G, r, clip_l, max_dist, gettid, overlap_regions, clustering_dist, pe_scope, cigar_index, event_pos, paired_end, tell, genome_scanner, - template_edges, node_to_name, pos2, mapq_thresh, clip_scope, ReadEnum_t.SPLIT, - bad_clip_counter, mm_only, site_adder, 0, trust_ins_len) - added = True - for cigar_index in range(cigar_l): - cigar_value = cigar_p[cigar_index] - opp = cigar_value & 15 - length = cigar_value >> 4 - - if opp == 0 or opp == 7 or opp == 8: - if find_n_aligned_bases: - n_aligned_bases += length - event_pos += length - continue - - if opp == 1: - if length >= min_sv_size: - # Insertion type - # if event_pos < 998000 or event_pos > 999000: - # continue - pos2 = event_pos + length - # if not events_to_add.empty() and event_pos - events_to_add.back().event_pos < elide_threshold and events_to_add.back().read_enum == ReadEnum_t.INSERTION: - # continue - # else: - events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION)) - added = True - elif opp == 2: - if length >= min_sv_size: - pos2 = event_pos + length - # if not events_to_add.empty() and event_pos - events_to_add.back().event_pos < elide_threshold and events_to_add.back().read_enum == ReadEnum_t.DELETION: - # continue - # else: - events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION)) - added = True - event_pos += length - elif opp == 3: - if length >= min_sv_size: - pos2 = event_pos + length - events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP)) - added = True - event_pos += length - else: - if opp != 4 and opp != 5: - event_pos += length - elif opp == 4 and length >= clip_l: - clipped = 1 - - if (paired_end and not added and contigs) or not paired_end: - # Whole alignment will be used, try infer position from soft-clip - cigar_index = -1 - pos2 = -1 - left_clip_size = 0 - right_clip_size = 0 - clip_sizes_hard(r, &left_clip_size, &right_clip_size) # soft and hard-clips - if r.flag & 8 and clipped: # paired by inference - # skip if both ends are clipped, usually means its a chunk of badly mapped sequence - # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ): - if not (left_clip_size and right_clip_size) and good_quality_clip(r, 20): - # Mate is unmapped, insertion type. Only add if soft-clip is available + template_edges, node_to_name, + pos2, mapq_thresh, clip_scope, ReadEnum_t.BREAKEND, bad_clip_counter, + mm_only, site_adder, 0, trust_ins_len) + else: + # Use whole read, could be normal or discordant + if not paired_end: + if max(left_clip_size, right_clip_size) > 250: + read_enum = ReadEnum_t.BREAKEND + if left_clip_size > right_clip_size: + event_pos = r.pos # else reference_end is used process_alignment(G, r, clip_l, max_dist, gettid, overlap_regions, clustering_dist, pe_scope, cigar_index, event_pos, paired_end, tell, genome_scanner, template_edges, node_to_name, - pos2, mapq_thresh, clip_scope, ReadEnum_t.BREAKEND, bad_clip_counter, + pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter, mm_only, site_adder, 0, trust_ins_len) - else: - # Use whole read, could be normal or discordant - if not paired_end: - if max(left_clip_size, right_clip_size) > 250: - read_enum = ReadEnum_t.BREAKEND - if left_clip_size > right_clip_size: - event_pos = r.pos # else reference_end is used - process_alignment(G, r, clip_l, max_dist, gettid, - overlap_regions, clustering_dist, pe_scope, - cigar_index, event_pos, paired_end, tell, genome_scanner, - template_edges, node_to_name, - pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter, - mm_only, site_adder, 0, trust_ins_len) + else: + if r.flag & 2 and abs(r.tlen) < max_dist and r.rname == r.rnext: + if not clipped: + continue + read_enum = ReadEnum_t.BREAKEND else: - if r.flag & 2 and abs(r.tlen) < max_dist and r.rname == r.rnext: - if not clipped: - continue - read_enum = ReadEnum_t.BREAKEND - else: - read_enum = ReadEnum_t.DISCORDANT + read_enum = ReadEnum_t.DISCORDANT - if left_clip_size or right_clip_size: - if left_clip_size > right_clip_size: - event_pos = r.pos # else reference_end is used - process_alignment(G, r, clip_l, max_dist, gettid, - overlap_regions, clustering_dist, pe_scope, - cigar_index, event_pos, paired_end, tell, genome_scanner, - template_edges, node_to_name, - pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter, - mm_only, site_adder, 0, trust_ins_len) - # process within-read events - if not events_to_add.empty(): - itr_events = events_to_add.begin() - while itr_events != events_to_add.end(): - v = dereference(itr_events) - if v.cigar_skip: - pos2 = v.pos2 - else: - pos2 = v.event_pos + v.length # fall back on original cigar event length + if left_clip_size or right_clip_size: + if left_clip_size > right_clip_size: + event_pos = r.pos # else reference_end is used process_alignment(G, r, clip_l, max_dist, gettid, overlap_regions, clustering_dist, pe_scope, - v.cigar_index, v.event_pos, paired_end, tell, genome_scanner, + cigar_index, event_pos, paired_end, tell, genome_scanner, template_edges, node_to_name, - v.pos2, mapq_thresh, clip_scope, v.read_enum, bad_clip_counter, - mm_only, site_adder, v.length, trust_ins_len) - preincrement(itr_events) + pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter, + mm_only, site_adder, 0, trust_ins_len) + # process within-read events + if not events_to_add.empty(): + itr_events = events_to_add.begin() + while itr_events != events_to_add.end(): + v = dereference(itr_events) + if v.cigar_skip: + pos2 = v.pos2 + else: + pos2 = v.event_pos + v.length # fall back on original cigar event length + process_alignment(G, r, clip_l, max_dist, gettid, + overlap_regions, clustering_dist, pe_scope, + v.cigar_index, v.event_pos, paired_end, tell, genome_scanner, + template_edges, node_to_name, + v.pos2, mapq_thresh, clip_scope, v.read_enum, bad_clip_counter, + mm_only, site_adder, v.length, trust_ins_len) + preincrement(itr_events) add_template_edges(G, template_edges) if site_adder: diff --git a/dysgu/post_call.py b/dysgu/post_call.py index 945c7f8..4ae45cb 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -2,7 +2,7 @@ import numpy as np from dysgu.map_set_utils import echo from dysgu import re_map -from dysgu.io_funcs import reverse_complement, intersecter #, iitree +from dysgu.io_funcs import reverse_complement, intersecter import zlib import math import pickle @@ -17,8 +17,11 @@ from collections import defaultdict, Counter import os import warnings -from sklearn.exceptions import InconsistentVersionWarning -warnings.filterwarnings(action='ignore', category=InconsistentVersionWarning) +try: + from sklearn.exceptions import InconsistentVersionWarning + warnings.filterwarnings(action='ignore', category=InconsistentVersionWarning) +except ImportError: + pass def get_badclip_metric(events, bad_clip_counter, bam, regions): diff --git a/pyproject.toml b/pyproject.toml index 91c0451..13eaf63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta" [project] name = "dysgu" -version = "1.8.0" +version = "1.8.1" description = "Structural variant calling" authors = [ { name = "Kez Cleal", email = "clealk@cardiff.ac.uk" }