From 300c743e477591d6d24f988b3001adf999189718 Mon Sep 17 00:00:00 2001 From: kcleal Date: Thu, 14 Mar 2024 21:02:12 +0000 Subject: [PATCH] Code improvements for call_component, graph. Better filtering of somatic svs. Slightly improved runtime and memory. --- dysgu/call_component.pyx | 233 ++++++++++++--------- dysgu/cluster.pyx | 32 ++- dysgu/edlib/__init__.py | 3 + dysgu/filter_normals.py | 53 ++++- dysgu/find_reads.hpp | 52 ++++- dysgu/graph.pyx | 427 +++++++++++++++++++++++---------------- dysgu/main.py | 2 + dysgu/post_call.py | 8 +- dysgu/sv_category.pxd | 1 + dysgu/sv_category.pyx | 63 +++++- 10 files changed, 584 insertions(+), 290 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index e466ed2..a897f5e 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -1,6 +1,6 @@ #cython: language_level=3, boundscheck=True, c_string_type=unicode, c_string_encoding=utf8, infer_types=True from __future__ import absolute_import -from collections import Counter, defaultdict +from collections import Counter, defaultdict, namedtuple import logging import numpy as np cimport numpy as np @@ -684,6 +684,9 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l tmp = defaultdict(list) # group by template name small_tlen_outliers = 0 # for paired reads note any smaller than expected TLENs for cigar_info, align in rds: + # echo(cigar_info, cigar_info.read_enum) + # if not paired_end and cigar_info.read_enum < 2: # split read + # continue tmp[align.qname].append((cigar_info, align)) if align.flag & 1 and abs(align.tlen) < insert_ppf: small_tlen_outliers += 1 @@ -694,6 +697,7 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l l_align = list(alignments) if len(l_align) > 1: pair = guess_informative_pair(l_align) + # pair = informative_pair(alignments, alignments, paired_end=paired_end) if pair is not None: if len(pair) == 2: a, b = pair @@ -872,18 +876,35 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l cdef tuple informative_pair(u, v): pri_u = None pri_v = None - for i in u: + for i_info, i in u: ri_flag = i.flag & 64 if not i.flag & 2304: # Not pri, supplementary --> is primary - pri_u = i - for j in v: + pri_u = i_info, i + for j_info, j in v: if j.flag & 64 == ri_flag: # Same read # Same read, primary + supp, or supp + supp - return i, j + return (i_info, i), (j_info, j) if not j.flag & 2304: # Is primary - pri_v = j + pri_v = j_info, j if pri_u is not None and pri_v is not None: return pri_u, pri_v + return None + +# cdef tuple informative_pair(u, v, bint paired_end): +# # fetch either a split read or pair1 and pair2 +# for i in u: +# i_info = i[0] +# for j in v: +# j_info = j[0] +# if j_info.hash_name == i_info.hash_name: +# continue +# if not paired_end: +# if i_info.read_enum == 1 and j_info.read_enum == 1: +# return i, j +# elif i_info.read_enum == j_info.read_enum: +# return i, j +# return None + cdef tuple break_ops(positions, precise, int limit, float median_pos): @@ -1286,63 +1307,66 @@ cdef void assemble_partitioned_reads(EventResult_t er, u_reads, v_reads, int blo er.ref_bases = rbases -cdef call_from_reads(u_reads, v_reads, int insert_size, int insert_stdev, float insert_ppf, int min_support, +cdef call_from_reads(u_reads_info, v_reads_info, int insert_size, int insert_stdev, float insert_ppf, int min_support, int block_edge, int assemble, info, bint paired_end): grp_u = defaultdict(list) grp_v = defaultdict(list) - for r in u_reads: - grp_u[r.qname].append(r) - for r in v_reads: - grp_v[r.qname].append(r) + for uinfo, r in u_reads_info: + grp_u[r.qname].append((uinfo, r)) + for vinfo, r in v_reads_info: + grp_v[r.qname].append((vinfo, r)) informative = [] cdef AlignmentItem v_item, i cdef int left_clip_a, right_clip_a, left_clip_b, right_clip_b for qname in [k for k in grp_u if k in grp_v]: # Qname found on both sides u = grp_u[qname] v = grp_v[qname] - if len(u) == 1 and len(v) == 1: - pair = (u[0], v[0]) - else: - pair = informative_pair(u, v) - if pair: - a, b = pair - a_ct = a.cigartuples - b_ct = b.cigartuples - a_qstart, a_qend, b_qstart, b_qend, a_len, b_len = start_end_query_pair(a, b) - # Soft-clips for the chosen pair, plus template start of alignment - left_clip_a, right_clip_a, left_clip_b, right_clip_b = mask_soft_clips(a.flag, b.flag, a_ct, b_ct) - a_chrom, b_chrom = a.rname, b.rname - a_start, a_end = a.pos, a.reference_end - b_start, b_end = b.pos, b.reference_end - read_overlaps_mate = same_read_overlaps_mate(a_chrom, b_chrom, a_start, a_end, b_start, b_end, a, b) - v_item = AlignmentItem(a.rname, - b.rname, - int(not a.flag & 2304), # Is primary - int(not b.flag & 2304), - 1 if a.flag & 64 else 2, - 1 if b.flag & 64 else 2, - a.pos, a.reference_end, - b.pos, b.reference_end, - 3 if a.flag & 16 == 0 else 5, - 3 if b.flag & 16 == 0 else 5, - left_clip_a, right_clip_a, - left_clip_b, right_clip_b, - a_qstart, a_qend, b_qstart, b_qend, a_len, b_len, - read_overlaps_mate, - a, b - ) - if v_item.left_clipA and v_item.right_clipA: - if a_ct[0][1] >= a_ct[-1][1]: - v_item.right_clipA = 0 - else: - v_item.left_clipA = 0 - if v_item.left_clipB and v_item.right_clipB: - if b_ct[0][1] >= b_ct[-1][1]: - v_item.right_clipB = 0 - else: - v_item.left_clipB = 0 - classify_d(v_item) - informative.append(v_item) + pair = informative_pair(u, v) #, paired_end) + if not pair: + continue + # if len(u) == 1 and len(v) == 1: + # pair = (u[0], v[0]) + # else: + # pair = informative_pair(u, v, paired_end) + a_node_info, a, b_node_info, b, = pair[0][0], pair[0][1], pair[1][0], pair[1][1] + a_ct = a.cigartuples + b_ct = b.cigartuples + a_qstart, a_qend, b_qstart, b_qend, a_len, b_len = start_end_query_pair(a, b) + # Soft-clips for the chosen pair, plus template start of alignment + left_clip_a, right_clip_a, left_clip_b, right_clip_b = mask_soft_clips(a.flag, b.flag, a_ct, b_ct) + a_chrom, b_chrom = a.rname, b.rname + a_start, a_end = a.pos, a.reference_end + b_start, b_end = b.pos, b.reference_end + read_overlaps_mate = same_read_overlaps_mate(a_chrom, b_chrom, a_start, a_end, b_start, b_end, a, b) + v_item = AlignmentItem(a.rname, + b.rname, + int(not a.flag & 2304), # Is primary + int(not b.flag & 2304), + 1 if a.flag & 64 else 2, + 1 if b.flag & 64 else 2, + a.pos, a.reference_end, + b.pos, b.reference_end, + 3 if a.flag & 16 == 0 else 5, + 3 if b.flag & 16 == 0 else 5, + left_clip_a, right_clip_a, + left_clip_b, right_clip_b, + a_qstart, a_qend, b_qstart, b_qend, a_len, b_len, + read_overlaps_mate, + a, b, + a_node_info, b_node_info + ) + if v_item.left_clipA and v_item.right_clipA: + if a_ct[0][1] >= a_ct[-1][1]: + v_item.right_clipA = 0 + else: + v_item.left_clipA = 0 + if v_item.left_clipB and v_item.right_clipB: + if b_ct[0][1] >= b_ct[-1][1]: + v_item.right_clipB = 0 + else: + v_item.left_clipB = 0 + classify_d(v_item) + informative.append(v_item) if not informative: return [] @@ -1436,7 +1460,7 @@ cdef filter_single_partitions(u_reads, v_reads): cdef one_edge(u_reads_info, v_reads_info, int clip_length, int insert_size, int insert_stdev, float insert_ppf, - int min_support, int block_edge, int assemble, info, bint paired_end): + int min_support, int block_edge, int assemble, info, bint paired_end): spanning_alignments = [] u_reads = [] v_reads = [] @@ -1555,7 +1579,7 @@ cdef one_edge(u_reads_info, v_reads_info, int clip_length, int insert_size, int return [er] else: - results = call_from_reads(u_reads, v_reads, insert_size, insert_stdev, insert_ppf, min_support, block_edge, assemble, info, paired_end) + results = call_from_reads(u_reads_info, v_reads_info, insert_size, insert_stdev, insert_ppf, min_support, block_edge, assemble, info, paired_end) return results @@ -1571,6 +1595,14 @@ cdef get_reads(infile, nodes_info, buffered_reads, n2n, bint add_to_buffer, site aligns = [] fpos = [] site_info = [] + + # Sometimes seeking into the bam file doesnt find the read. This can be addressed by instead + # using the index and looping through the file, but this adds significant overhead + # todo see if missing reads are an issue for long reads + # regions = [] + # hash_names = {} + # has_index = infile.has_index() + for int_node in nodes_info: if sites_index and int_node in sites_index: continue @@ -1605,6 +1637,23 @@ cdef get_reads(infile, nodes_info, buffered_reads, n2n, bint add_to_buffer, site if add_to_buffer: buffered_reads[int_node] = a break + # else: + # if has_index: + # nn = n2n[int_node] + # hash_names[(nn.hash_name, nn.flag, nn.pos, nn.chrom)] = nn + # regions.append((infile.get_reference_name(nn.chrom), nn.pos, nn.pos + 200)) + # + # if regions: + # regions = merge_intervals(regions) + # for rgn in regions: + # for a in infile.fetch(*rgn): + # v = xxhasher(bam_get_qname(a._delegate), len(a.qname), 42) + # nm = (v, a.flag, a.pos, a.rname) + # if nm in hash_names: + # aligns.append((hash_names[nm], a)) + # + # if len(aligns) < len(nodes_info): + # echo(len(aligns), len(nodes_info)) return aligns @@ -1614,8 +1663,8 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, # Sometimes partitions are not linked, happens when there is not much support between partitions # Then need to decide whether to call from a single partition - n2n = data["n2n"] - seen = set(range(len(data['parts']))) + n2n = data.n2n + seen = set(range(len(data.parts))) out_counts = defaultdict(int) # The number of 'outward' links to other clusters cdef int buffered_reads = 0 cdef bint add_to_buffer = 1 @@ -1627,9 +1676,9 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, else: sites_info = [] # u and v are the part ids, d[0] and d[1] are the lists of nodes for those parts - for (u, v), d in data["s_between"].items(): - rd_u = get_reads(bam, d[0], data["reads"], data["n2n"], add_to_buffer, info) # [(Nodeinfo, alignment)..] - rd_v = get_reads(bam, d[1], data["reads"], data["n2n"], add_to_buffer, info) + for (u, v), d in data.s_between: #.items(): + rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info) # [(Nodeinfo, alignment)..] + rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info) total_reads = len(rd_u) + len(rd_v) buffered_reads += total_reads if add_to_buffer and buffered_reads > 50000: @@ -1643,36 +1692,40 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, if v in seen: seen.remove(v) + events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, + assemble_contigs, + sites_info, paired_end) + # finds reads that should be a single partition - u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v) - if len(u_reads) > 0 and len(v_reads) > 0: - events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs, - sites_info, paired_end) - if u_single: - res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - sites_info, paired_end, length_extend, divergence) - if res: - if isinstance(res, EventResult): - events.append(res) - else: - events += res - if v_single: - res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - sites_info, paired_end, length_extend, divergence) - if res: - if isinstance(res, EventResult): - events.append(res) - else: - events += res + # u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v) + # if len(u_reads) > 0 and len(v_reads) > 0: + # events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs, + # sites_info, paired_end) + # if u_single: + # res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + # sites_info, paired_end, length_extend, divergence) + # if res: + # if isinstance(res, EventResult): + # events.append(res) + # else: + # events += res + # if v_single: + # res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + # sites_info, paired_end, length_extend, divergence) + # if res: + # if isinstance(res, EventResult): + # events.append(res) + # else: + # events += res # Process any singles / unconnected blocks if seen: for part_key in seen: - d = data["parts"][part_key] + d = data.parts[part_key] lb = lower_bound_support if len(sites_info) == 0 else 1 if max_single_size > len(d) >= lb: # Call single block, only collect local reads to the block - rds = get_reads(bam, d, data["reads"], data["n2n"], 0, info) + rds = get_reads(bam, d, data.reads, data.n2n, 0, info) if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): continue res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, @@ -1684,13 +1737,13 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, events += res # Check for events within clustered nodes - for k, d in data["s_within"].items(): + for k, d in data.s_within: #.items(): o_count = out_counts[k] i_counts = len(d) if i_counts > max_single_size: continue if o_count > 0 and i_counts > (2*min_support) and i_counts > o_count: - rds = get_reads(bam, d, data["reads"], data["n2n"], 0, info) + rds = get_reads(bam, d, data.reads, data.n2n, 0, info) if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): continue res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, @@ -1705,10 +1758,10 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_stdev, insert_ppf, min_support, lower_bound_support, assemble_contigs, max_single_size, sites_index, bint paired_end, int length_extend, float divergence): - n_parts = len(data["parts"]) + n_parts = len(data.parts) events = [] - if "info" in data: - info = data["info"] + if data.info: + info = data.info else: info = None # next deal with info - need to filter these into the partitions, then deal with them in single / one_edge @@ -1717,9 +1770,9 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std events += multi(data, bam, insert_size, insert_stdev, insert_ppf, clip_length, min_support, lower_bound_support, assemble_contigs, max_single_size, info, paired_end, length_extend, divergence) elif n_parts == 0: - if len(data["n2n"]) > max_single_size: + if len(data.n2n) > max_single_size: return [] - rds = get_reads(bam, data["n2n"].keys(), data["reads"], data["n2n"], 0, info) + rds = get_reads(bam, data.n2n.keys(), data.reads, data.n2n, 0, info) if info: sites_info = list(info.values()) else: diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 33bc24a..8c7f300 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -150,10 +150,14 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re event_iter = compare_subset(potential, max_dist) # Use iitree, generate overlap tree and perform intersections seen = set([]) pad = 100 + # max_edges = 8 disjoint_nodes = set([]) # if a component has more than one disjoint nodes it needs to be broken apart + # edge_counts = defaultdict(int) for ei, ej, idx, jdx in event_iter: i_id = ei.event_id j_id = ej.event_id + # if edge_counts[i_id] > max_edges and edge_counts[j_id] > max_edges: + # continue if not same_sample: if ei.sample == ej.sample: continue @@ -183,6 +187,8 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re # Force merging of translocations that have similar loci if not intra: G.add_edge(i_id, j_id, loci_same=loci_same) #, w=0) + # edge_counts[i_id] += 1 + # edge_counts[j_id] += 1 continue one_is_imprecise = False @@ -265,6 +271,8 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re if ml > 0: if l_ratio > 0.5 or (one_is_imprecise and l_ratio > 0.3): G.add_edge(i_id, j_id, loci_same=loci_same) + # edge_counts[i_id] += 1 + # edge_counts[j_id] += 1 else: v = None if ci_alt and cj_alt: @@ -285,26 +293,42 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re if ei.remap_score == 0 or ej.remap_score == 0: if (v[0][0].islower() and v[1][-1].islower()) or (v[0][-1].islower() and v[1][0].islower()): G.add_edge(i_id, j_id, loci_same=loci_same) + # edge_counts[i_id] += 1; edge_counts[j_id] += 1 continue if assembler.check_contig_match(v[0], v[1], return_int=True): G.add_edge(i_id, j_id, loci_same=True) + # edge_counts[i_id] += 1; edge_counts[j_id] += 1 # see if alt sequence can be found in other contig else: + # echo("checking alts") + # if ci_alt and cj_alt: + # echo("yes0", ci_alt, cj_alt) + # if assembler.check_contig_match(ci_alt, cj_alt, return_int=True): + # G.add_edge(i_id, j_id, loci_same=True) + # edge_counts[i_id] += 1; edge_counts[j_id] += 1 + # continue if ci_alt and cj: if assembler.check_contig_match(ci_alt, cj, return_int=True): G.add_edge(i_id, j_id, loci_same=True) + # edge_counts[i_id] += 1; edge_counts[j_id] += 1 continue if ci_alt and cj2: if assembler.check_contig_match(ci_alt, cj2, return_int=True): G.add_edge(i_id, j_id, loci_same=True) + # edge_counts[i_id] += 1; + # edge_counts[j_id] += 1 continue if cj_alt and ci: if assembler.check_contig_match(cj_alt, ci, return_int=True): G.add_edge(i_id, j_id, loci_same=True) + # edge_counts[i_id] += 1; + # edge_counts[j_id] += 1 continue if cj_alt and ci2: if assembler.check_contig_match(cj_alt, ci2, return_int=True): G.add_edge(i_id, j_id, loci_same=True) + # edge_counts[i_id] += 1; + # edge_counts[j_id] += 1 continue return G, disjoint_nodes @@ -811,7 +835,9 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N trust_ins_len=args["trust_ins_len"] == "True", low_mem=low_mem, temp_dir=tdir, - find_n_aligned_bases=find_n_aligned_bases) + find_n_aligned_bases=find_n_aligned_bases, + position_distance_thresh=args['sd'], + ) sites_index = None if sites_adder: sites_index = sites_adder.sites_index @@ -937,7 +963,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N pickle.dump(res, completed_file) else: j_submitted, w_idx = heapq.heappop(minhq) - heapq.heappush(minhq, (j_submitted + len(res["n2n"]), w_idx)) + heapq.heappush(minhq, (j_submitted + len(res.n2n), w_idx)) msg_queues[w_idx][1].send(res) else: # most partitions processed here, dict returned, or None @@ -969,7 +995,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N pickle.dump(res, completed_file) else: j_submitted, w_idx = heapq.heappop(minhq) - heapq.heappush(minhq, (j_submitted + len(res["n2n"]), w_idx)) + heapq.heappush(minhq, (j_submitted + len(res.n2n), w_idx)) msg_queues[w_idx][1].send(res) if completed_file is not None: diff --git a/dysgu/edlib/__init__.py b/dysgu/edlib/__init__.py index e69de29..5dd9e64 100644 --- a/dysgu/edlib/__init__.py +++ b/dysgu/edlib/__init__.py @@ -0,0 +1,3 @@ + +from .edlib import (align) +__all__ = ['align'] \ No newline at end of file diff --git a/dysgu/filter_normals.py b/dysgu/filter_normals.py index 3b155d3..bafbf30 100644 --- a/dysgu/filter_normals.py +++ b/dysgu/filter_normals.py @@ -1,5 +1,7 @@ import random from sys import stderr, argv + +import numpy as np import pysam from pysam import CSOFT_CLIP, CHARD_CLIP, CDEL, CINS, CDIFF, CMATCH import time @@ -14,6 +16,7 @@ import zlib from dysgu.edlib import edlib import glob +from dysgu.map_set_utils import echo random.seed(42) @@ -398,6 +401,7 @@ def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100, min_length = svlen * 0.2 # 0.25 max_distance = svlen * 10 ins_like = svtype == "DUP" or svtype == "INV" + debug = False for k, l in ct: if k == CHARD_CLIP or k == CSOFT_CLIP: continue @@ -408,17 +412,22 @@ def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100, pos += l continue elif not is_insertion and k == CINS: - if l > 250 and ins_like: + span_similarity = min(l, svlen) / max(l, svlen) + if l > 250 and ins_like and span_similarity > 0.85 and abs(posA - pos) < max(l, svlen): + if debug: echo("ret0", svlen, f"{posA}-{posA + svlen}", f"{pos}-{pos + l}", (k, l)) return True elif svlen * 2 < l and (abs(posA - pos) < max_distance or (abs(posB - pos + l) < max_distance)): + if debug: echo("ret1") return True continue end = pos + l if l > min_length and abs(posA - pos) < max_distance: if is_insertion: if span_position_distance(posA, posA + svlen, pos, end, pos_threshold, span_threshold): + if debug: echo("ret2", f"{posA}-{posA + svlen}", f"{pos}-{end}") return True elif span_position_distance(posA, posB, pos, end, pos_threshold, span_threshold): + if debug: echo("ret3") return True if k == CDEL: pos += l @@ -640,7 +649,12 @@ def matching_soft_clips(r, reads_with_nearby_soft_clips, pe): def has_low_support(r, sample, support_fraction): - cov = r.samples[sample]['COV'] + d = r.samples[sample] + m = max(d['ICN'], d['OCN']) + if m == 0: + cov = d['COV'] + else: + cov = d['COV'] * (min(d['ICN'], d['OCN']) / m) min_support = round(1.5 + support_fraction * cov) if r.info['SU'] < min_support: return True @@ -649,7 +663,13 @@ def has_low_support(r, sample, support_fraction): def has_low_WR_support(r, sample, support_fraction, n_overlapping, n_mapq0): sf = support_fraction / 2 - cov = r.samples[sample]['COV'] + # cov = r.samples[sample]['COV'] + d = r.samples[sample] + m = max(d['ICN'], d['OCN']) + if m == 0: + cov = d['COV'] + else: + cov = d['COV'] * (min(d['ICN'], d['OCN']) / m) if n_mapq0 / n_overlapping > 0.3: cov = max(cov, n_overlapping) min_support = min(4, round(1.5 + sf * cov)) @@ -681,6 +701,7 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, has_contig = 'CONTIG' in r.info or 'CONTIG2' in r.info or r.alts[0][0] != "<" is_paired_end = False good_step = good_step_translocation(r, sample) + debug = False #True for is_paired_end, aln in iterate_bams_single_region(bams, chromA, posA, pad, bam_is_paired_end): if is_paired_end: distance = 50 @@ -703,10 +724,12 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, if aln.flag & 4 or not has_clip(aln): continue if matching_supplementary(aln, infile, posA, posB): + if debug: echo("tra0") if keep_all: r.filter.add("normal") return False if not good_step and matching_ins_translocation(posA, aln): + if debug: echo("tra1") if keep_all: r.filter.add("normal") return False @@ -715,6 +738,7 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, if any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=50): nearby_soft_clips += 1 elif any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=250): + if debug: echo("tra2") if keep_all: r.filter.add("lowSupport") return False @@ -741,10 +765,12 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, if aln.flag & 4 or not has_clip(aln): continue if matching_supplementary(aln, infile, posB, posA): + if debug: echo("tra3") if keep_all: r.filter.add("normal") return False if not good_step and matching_ins_translocation(posB, aln): + if debug: echo("tra4") if keep_all: r.filter.add("normal") return False @@ -753,15 +779,18 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, if any_nearby_soft_clip(r.pos, posB, aln, ct, "TRA", 30, clip_length=50): nearby_soft_clips += 1 elif any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=250): + if debug: echo("tra5") if keep_all: r.filter.add("lowSupport") return False if not is_paired_end and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): + if debug: echo("tra6") if keep_all: r.filter.add("lowSupport") return False if cached and matching_soft_clips(r, cached, is_paired_end): + if debug: echo("tra7") if keep_all: r.filter.add("normal") return False @@ -780,6 +809,8 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa is_paired_end = False n_mapq_0 = 0 n_overlpapping = 0 + # debug = False if r.rid != "229738" else True + debug = False #False for is_paired_end, aln in iterate_bams(bams, r.chrom, posA, r.chrom, posB, pad, bam_is_paired_end): if is_paired_end: a_posA = min(aln.pos, aln.pnext) @@ -788,6 +819,7 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa continue if aln.rname != aln.rnext: if matching_supplementary(aln, infile, posA, posB): + if debug: echo("1") if keep_all: r.filter.add("normal") return False @@ -796,15 +828,18 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa if abs(a_posA - posA) > pad or abs(a_posB - posB) > pad: continue if not is_insertion and not aln.flag & 10: # proper pair, mate unmapped - if span_position_distance(posA, posB, a_posA, a_posB, pos_threshold=20, span_threshold=0.5): + if span_position_distance(posA, posB, a_posA, a_posB, pos_threshold=20, span_threshold=0.7): + if debug: echo("2") if keep_all: r.filter.add("normal") return False if not is_insertion and matching_supplementary(aln, infile, posA, posB): + if debug: echo("3") if keep_all: r.filter.add("normal") return False if svlen < 80 and matching_gap(posA, posB, aln, svtype, is_insertion, svlen): + if debug: echo("4") if keep_all: r.filter.add("normal") return False @@ -812,10 +847,12 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa else: if matching_gap(posA, posB, aln, svtype, is_insertion, svlen, pos_threshold=30, span_threshold=0.7): + if debug: echo("5", svlen, aln.qname) if keep_all: r.filter.add("normal") return False if matching_supplementary(aln, infile, posA, posB): + if debug: echo("6") if keep_all: r.filter.add("normal") return False @@ -828,21 +865,25 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa covered += 1 if any_nearby_soft_clip(posA, posB, aln, ct, svtype, 30, clip_length=50): nearby_soft_clips += 1 - cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=min(500, svlen * 0.5), clip_length=50) + # cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=min(500, svlen * 0.5), clip_length=50) if not is_paired_end and not covered: + if debug: echo("7") if keep_all: r.filter.add("lowSupport") return False if not is_paired_end and has_low_WR_support(r, sample, support_fraction, n_overlpapping, n_mapq_0): + if debug: echo("8") if keep_all: r.filter.add("lowSupport") return False if not is_paired_end and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): + if debug: echo("9") if keep_all: r.filter.add("lowSupport") return False if cached and matching_soft_clips(r, cached, is_paired_end): + if debug: echo("10") if keep_all: r.filter.add("normal") return False @@ -880,7 +921,7 @@ def run_filtering(args): filter_results = defaultdict(int) written = 0 for idx, r in enumerate(vcf): - # if r.id != "25979": + # if r.id != "165324": # continue old_filter_value = list(r.filter.keys()) r.filter.clear() diff --git a/dysgu/find_reads.hpp b/dysgu/find_reads.hpp index 19e4aa7..7138af1 100644 --- a/dysgu/find_reads.hpp +++ b/dysgu/find_reads.hpp @@ -14,6 +14,22 @@ #include +void remove_extraneous_tags(bam1_t* src, int remove_extra_tags) { + // needed NM, AS, SA, XS, but remove some of the large offenders to save a bit of disk space + int i=0; + for (auto &t : {"ML", "MM", "MD"}) { + uint8_t* data = bam_aux_get(src, t); + if (data != nullptr) { + bam_aux_del(src, data); + } + i += 1; + if (i >= remove_extra_tags) { + break; + } + } +} + + class CoverageTrack { public: @@ -107,7 +123,7 @@ int process_alignment(int& current_tid, std::deque> robin_hood::unordered_set& read_names, CoverageTrack& cov_track, uint64_t& total, const int n_chromosomes, const int mapq_thresh, const int check_clips, const int min_within_size, sam_hdr_t **samHdr, htsFile **f_out, - std::string& temp_folder, const bool write_all, long *n_aligned_bases) { + std::string& temp_folder, const bool write_all, long *n_aligned_bases, int remove_extra_tags) { int result; bam1_t* aln; @@ -128,6 +144,9 @@ int process_alignment(int& current_tid, std::deque> // Check if write queue is full if (write_queue.size() > max_write_queue) { for (const auto& val: write_queue) { + if (remove_extra_tags) { + remove_extraneous_tags(val, remove_extra_tags); + } result = sam_write1(*f_out, *samHdr, val); if (result < 0) { return -1; } total += 1; @@ -317,7 +336,7 @@ int search_hts_alignments(char* infile, char* outfile, uint32_t min_within_size, std::string region_string = region; long n_aligned_bases = 0; - + int remove_extra_tags = 0; // iterate through regions if (region_string != ".,") { @@ -342,12 +361,21 @@ int search_hts_alignments(char* infile, char* outfile, uint32_t min_within_size, // Read alignment into the back of scope queue while ( sam_itr_next(fp_in, iter, scope.back().second) >= 0 ) { - + if (total == 0) { + for (auto &t : {"ML", "MM", "MD"}) { + uint8_t* data = bam_aux_get(scope.back().second, t); + if (data != nullptr) { + remove_extra_tags += 1; + } else { + break; + } + } + } int success = process_alignment(current_tid, scope, write_queue, max_scope, max_write_queue, clip_length, read_names, cov_track, total, n_chromosomes, mapq_thresh, check_clips, min_within_size, - &samHdr, &f_out, temp_folder, write_all, &n_aligned_bases); + &samHdr, &f_out, temp_folder, write_all, &n_aligned_bases, remove_extra_tags); if (success < 0) { std::cerr << "Failed to process input alignment. Stopping" << region << std::endl; break; @@ -357,12 +385,21 @@ int search_hts_alignments(char* infile, char* outfile, uint32_t min_within_size, } else { // iterate whole alignment file (no index needed) while (sam_read1(fp_in, samHdr, scope.back().second) >= 0) { - + if (total == 0) { + for (auto &t : {"ML", "MM", "MD"}) { + uint8_t* data = bam_aux_get(scope.back().second, t); + if (data != nullptr) { + remove_extra_tags += 1; + } else { + break; + } + } + } int success = process_alignment(current_tid, scope, write_queue, max_scope, max_write_queue, clip_length, read_names, cov_track, total, n_chromosomes, mapq_thresh, check_clips, min_within_size, - &samHdr, &f_out, temp_folder, write_all, &n_aligned_bases); + &samHdr, &f_out, temp_folder, write_all, &n_aligned_bases, remove_extra_tags); if (success < 0) { std::cerr << "Failed to process input alignment. Stopping" << region << std::endl; break; @@ -384,6 +421,9 @@ int search_hts_alignments(char* infile, char* outfile, uint32_t min_within_size, } for (const auto& val: write_queue) { + if (remove_extra_tags) { + remove_extraneous_tags(val, remove_extra_tags); + } result = sam_write1(f_out, samHdr, val); if (result < 0) { return -1; } total += 1; diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index db0f274..74d3f5a 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -7,7 +7,6 @@ import sortedcontainers import cython from cpython cimport array import array -import re import logging from dysgu.map_set_utils cimport unordered_map as robin_map, Py_SimpleGraph from dysgu.map_set_utils cimport multimap as cpp_map @@ -23,7 +22,7 @@ from libcpp.deque cimport deque as cpp_deque from libcpp.vector cimport vector from libcpp.pair cimport pair as cpp_pair from libcpp.unordered_map cimport unordered_map -from libc.stdint cimport uint16_t, uint32_t, int32_t, uint64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, int32_t, uint64_t from libc.stdlib cimport abs as c_abs from cython.operator import dereference, postincrement, postdecrement, preincrement, predecrement from pysam.libcalignedsegment cimport AlignedSegment @@ -286,14 +285,16 @@ cdef class PairedEndScoper: cdef cpp_map[int, LocalVal] loci # Track the local breaks and mapping locations cdef vector[cpp_map[int, LocalVal]] chrom_scope # Track the mate-pair breaks and locations cdef float norm - cdef float thresh + cdef float thresh # spd + cdef float position_distance_thresh cdef bint paired_end - def __init__(self, max_dist, clst_dist, n_references, norm, thresh, paired_end): + def __init__(self, max_dist, clst_dist, n_references, norm, thresh, paired_end, position_distance_thresh): self.clst_dist = clst_dist self.max_dist = max_dist self.local_chrom = -1 self.norm = norm self.thresh = thresh + self.position_distance_thresh = position_distance_thresh self.paired_end = paired_end cdef cpp_map[int, LocalVal] scope for n in range(n_references + 1): # Add one for special 'insertion chromosome' @@ -347,28 +348,26 @@ cdef class PairedEndScoper: local_it = forward_scope.lower_bound(pos2) steps = 0 if local_it != forward_scope.end(): - while steps < 20: #6: + while local_it != forward_scope.end() and steps < 20: vitem = dereference(local_it) + preincrement(local_it) + steps += 1 + if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or (read_enum == INSERTION and vitem.second.read_enum == DELETION): - steps += 1 continue node_name2 = vitem.second.node_name if node_name2 != node_name: # Can happen due to within-read events - # if node_name == 77: - # echo("-->", node_name, node_name2, current_chrom == chrom2, current_pos, pos2, pos2 - current_pos, vitem.second.pos2 - vitem.first, - # is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2), - # ) - # echo(read_enum, vitem.second.read_enum, read_enum == INSERTION, vitem.second.read_enum == DELETION) if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): sep = c_abs(vitem.first - pos2) + if sep >= self.max_dist: + break sep2 = c_abs(vitem.second.pos2 - current_pos) - if sep < self.max_dist and sep2 < self.max_dist: - if sep < 35: + if vitem.second.chrom2 == chrom2 and sep2 < self.max_dist: + if sep < 150:#35: if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: max_span = max(length_from_cigar, vitem.second.length_from_cigar) span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span - # echo("hi", node_name2, (length_from_cigar, vitem.second.length_from_cigar), span_distance, self.thresh) - if span_distance < 0.8: + if span_distance < self.position_distance_thresh: found_exact.push_back(node_name2) else: found_exact.push_back(node_name2) @@ -379,54 +378,43 @@ cdef class PairedEndScoper: elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): found2.push_back(node_name2) - if sep >= self.max_dist: - break - preincrement(local_it) - steps += 1 - if local_it == forward_scope.end(): - break + if not found_exact.empty(): + return found_exact - if found_exact.empty(): - local_it = forward_scope.lower_bound(pos2) - vitem = dereference(local_it) - if local_it != forward_scope.begin(): - predecrement(local_it) # Move back one before staring search, otherwise same value is processed twice - steps = 0 - while steps < 20: # 6: - vitem = dereference(local_it) - if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or (read_enum == INSERTION and vitem.second.read_enum == DELETION): - steps += 1 - continue - node_name2 = vitem.second.node_name - if node_name2 != node_name: - # if node_name == 1: - # echo("2", node_name, node_name2, read_enum) #, is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2), span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar)) - # if node_name == 77: - # echo('r', current_pos, pos2, vitem.first, vitem.second.pos2, span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len)) - if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): - sep = c_abs(vitem.first - pos2) - sep2 = c_abs(vitem.second.pos2 - current_pos) - if sep < self.max_dist and vitem.second.chrom2 == chrom2 and \ - sep2 < self.max_dist: - if sep < 35: - if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: - max_span = max(length_from_cigar, vitem.second.length_from_cigar) - span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span - # echo("hi2", node_name2, (length_from_cigar, vitem.second.length_from_cigar), span_distance, self.thresh) - if span_distance < 0.8: - found_exact.push_back(node_name2) - else: + local_it = forward_scope.lower_bound(pos2) + vitem = dereference(local_it) + if local_it != forward_scope.begin(): + predecrement(local_it) # Move back one before staring search, otherwise same value is processed twice + steps = 0 + while local_it != forward_scope.begin() and steps < 20: + vitem = dereference(local_it) + predecrement(local_it) + steps += 1 + if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or (read_enum == INSERTION and vitem.second.read_enum == DELETION): + continue + node_name2 = vitem.second.node_name + if node_name2 != node_name: + if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): + sep = c_abs(vitem.first - pos2) + if sep >= self.max_dist: + break + sep2 = c_abs(vitem.second.pos2 - current_pos) + if vitem.second.chrom2 == chrom2 and sep2 < self.max_dist: + if sep < 150:#35: + if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: + max_span = max(length_from_cigar, vitem.second.length_from_cigar) + span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span + if span_distance < self.position_distance_thresh: found_exact.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) + else: + found_exact.push_back(node_name2) elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): found2.push_back(node_name2) elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): found2.push_back(node_name2) - if local_it == forward_scope.begin() or sep >= self.max_dist: - break - predecrement(local_it) - steps += 1 + elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): + found2.push_back(node_name2) + if not found_exact.empty(): return found_exact else: @@ -458,72 +446,94 @@ cdef class PairedEndScoper: forward_scope.insert(pp) +cdef struct TemplateNode: + int query_start + int node + int flag + + cdef class TemplateEdges: - cdef public unordered_map[string, vector[int]] templates_s + cdef public unordered_map[string, vector[TemplateNode]] templates_s # query_start, node, flag def __init__(self): pass cdef void add(self, str template_name, int flag, int node, int query_start): - cdef vector[int] val cdef bytes key = bytes(template_name, encoding="utf8") - val.push_back(query_start) - val.push_back(node) - val.push_back(flag) - self.templates_s[key].insert(self.templates_s[key].end(), val.begin(), val.end()) + self.templates_s[key].resize(self.templates_s[key].size() + 1) + cdef TemplateNode *t = &self.templates_s[key].back() + t.query_start = query_start + t.flag = t.flag + t.node = node cdef void add_template_edges(Py_SimpleGraph G, TemplateEdges template_edges): # this function joins up template reads (read 1, read 2, plus any supplementary) - cdef int ii, u_start, v_start, u, v, uflag, vflag - cdef unordered_map[string, vector[int]].iterator it = template_edges.templates_s.begin() - cdef vector[int] arr + cdef int i, j, u_start, v_start, u, v, uflag, vflag, primary1, primary2, n1, n2 + cdef unordered_map[string, vector[TemplateNode]].iterator it = template_edges.templates_s.begin() + cdef vector[TemplateNode] arr + cdef TemplateNode *t while it != template_edges.templates_s.end(): - arr = dereference(it).second # Array values are query start, node-name, flag + arr = dereference(it).second postincrement(it) read1_aligns = [] read2_aligns = [] - for ii in range(0, arr.size(), 3): - if arr[ii + 2] & 64: # first in pair - read1_aligns.append(arr[ii:ii + 3]) + for i in range(0, arr.size()): + t = &arr[i] + if t.flag & 64: # first in pair + read1_aligns.append((t.query_start, t.node, t.flag)) else: - read2_aligns.append(arr[ii:ii + 3]) - primary1 = None - primary2 = None - if len(read1_aligns) > 0: - if len(read1_aligns) == 1: + read2_aligns.append((t.query_start, t.node, t.flag)) + primary1 = -1 + primary2 = -1 + n1 = len(read1_aligns) + n2 = len(read2_aligns) + if n1 > 0: + if n1 == 1: if not read1_aligns[0][2] & 2304: # Is primary primary1 = read1_aligns[0][1] else: - if len(read1_aligns) > 2: - read1_aligns = sorted(read1_aligns) + if n1 > 2: + read1_aligns.sort() # Add edge between alignments that are neighbors on the query sequence, or between primary alignments - for ii in range(len(read1_aligns) - 1): - u_start, u, uflag = read1_aligns[ii] + for i in range(n1 - 1): + u_start, u, uflag = read1_aligns[i] if not uflag & 2304: primary1 = u - v_start, v, vflag = read1_aligns[ii + 1] + v_start, v, vflag = read1_aligns[i + 1] if not G.hasEdge(u, v): G.addEdge(u, v, w=1) - if primary1 is None: + if primary1 == -1: if not read1_aligns[-1][2] & 2304: primary1 = read1_aligns[-1][1] - if len(read2_aligns) > 0: - if len(read2_aligns) == 1: + if n2 > 0: + if n2 == 1: if not read2_aligns[0][2] & 2304: primary2 = read2_aligns[0][1] else: - if len(read2_aligns) > 2: - read2_aligns = sorted(read2_aligns) - for ii in range(len(read2_aligns) - 1): - u_start, u, uflag = read2_aligns[ii] + if n2 > 2: + read2_aligns.sort() + for i in range(n2 - 1): + u_start, u, uflag = read2_aligns[i] if not uflag & 2304: primary2 = u - v_start, v, vflag = read2_aligns[ii + 1] + v_start, v, vflag = read2_aligns[i + 1] if not G.hasEdge(u, v): G.addEdge(u, v, w=1) - if primary2 is None: + # Tried this, only edges between ajacent query aligns, not duplicate ones. but hurt performance! + # j = i + 1 + # while j < n2: + # v_start, v, vflag = read2_aligns[j] + # if v_start != u_start: + # if not G.hasEdge(u, v): + # G.addEdge(u, v, w=1) + # if j < n2 - 1 and read2_aligns[j + 1][0] == v_start: + # j += 1 + # continue + # break + # j += 1 + if primary2 == -1: if not read2_aligns[-1][2] & 2304: primary2 = read2_aligns[-1][1] - if primary1 is not None and primary2 is not None: + if primary1 >= 0 and primary2 >= 0: if not G.hasEdge(primary1, primary2): G.addEdge(primary1, primary2, w=1) @@ -545,35 +555,48 @@ cdef class NodeName: self.tell = t self.cigar_index = cigar_index self.event_pos = event_pos - def as_tuple(self): - return self.hash_name, self.flag, self.pos, self.chrom, self.tell, self.cigar_index, self.event_pos + def as_dict(self): + return {"hash_name": self.hash_name, + "flag": self.flag, + "chrom": self.chrom, + "pos": self.pos, + "tell": self.tell, + "cigar_index": self.cigar_index, + "event_pos": self.event_pos, + "read_enum": self.read_enum, + } + + +cdef struct NodeNameItem: + uint64_t hash_val, tell + uint32_t pos, event_pos + int32_t cigar_index + uint16_t flag, chrom + cdef class NodeToName: # Index these vectors to get the unique 'template_name' # node names have the form (hash qname, flag, pos, chrom, tell, cigar index, event pos) - cdef vector[uint64_t] h - cdef vector[uint16_t] f - cdef vector[uint32_t] p - cdef vector[uint16_t] c - cdef vector[uint64_t] t - cdef vector[int32_t] cigar_index - cdef vector[uint32_t] event_pos + cdef vector[NodeNameItem] stored_nodes def __cinit__(self): pass - cdef void append(self, long a, int b, int c, int d, long e, int f, int g) nogil: - self.h.push_back(a) - self.f.push_back(b) - self.p.push_back(c) - self.c.push_back(d) - if e == -1: # means to look in read buffer instead - e = 0 - self.t.push_back(e) - self.cigar_index.push_back(f) - self.event_pos.push_back(g) + cdef void append(self, long hash_val, int flag, int pos, int chrom, long tell, int cigar_index, int event_pos): + self.stored_nodes.resize(self.stored_nodes.size() + 1); + cdef NodeNameItem *nd = &self.stored_nodes.back(); + nd.hash_val = hash_val + nd.flag = flag + nd.pos = pos + nd.chrom = chrom + if tell == -1: # means to look in read buffer instead + tell = 0 + nd.tell = tell + nd.cigar_index = cigar_index + nd.event_pos = event_pos def __getitem__(self, idx): - return NodeName(self.h[idx], self.f[idx], self.p[idx], self.c[idx], self.t[idx], self.cigar_index[idx], self.event_pos[idx]) + cdef NodeNameItem *nd = &self.stored_nodes[idx] + return NodeName(nd.hash_val, nd.flag, nd.pos, nd.chrom, nd.tell, nd.cigar_index, nd.event_pos) cdef bint same_template(self, int query_node, int target_node): - if self.h[query_node] == self.h[target_node]: + if self.stored_nodes[query_node].hash_val == self.stored_nodes[target_node].hash_val: return True cdef get_query_pos_from_cigarstring(cigar, pos): @@ -599,12 +622,38 @@ cdef get_query_pos_from_cigarstring(cigar, pos): return start, end, pos, ref_end -def get_query_pos_from_cigartuples(r): +cdef void parse_cigar(str cigar, int &start, int &end, int &ref_end): + cdef: + int in_lead = 1 + int num = 0 # To accumulate the number as we parse through the string + char op + cdef bytes t = cigar.encode('utf8') + cdef char *c_cigar = t # Convert Python string to char* + while dereference(c_cigar): + if b'0' <= dereference(c_cigar) <= b'9': + # Convert digit char to int and accumulate + num = num * 10 + (dereference(c_cigar) - 48 ) # ord(b'0') = 48 + else: + op = dereference(c_cigar) + if in_lead and (op == b'S' or op == b'H'): + start += num + end += num + elif op == b'D': + ref_end += num + elif op == b'I': + end += num + elif op == b'M' or op == b'=' or op == b'X': + end += num + ref_end += num + num = 0 + in_lead = 0 + c_cigar += 1 + +def get_query_pos_from_cigartuples(r, query_length): # Infer the position on the query sequence of the alignment using cigar string start = 0 - query_length = r.infer_read_length() # Note, this also counts hard-clips + # query_length = r.infer_read_length() # Note, this also counts hard-clips end = query_length - if r.cigartuples[0][0] == 4 or r.cigartuples[0][0] == 5: start += r.cigartuples[0][1] if r.cigartuples[-1][0] == 4 or r.cigartuples[-1][0] == 5: @@ -613,7 +662,7 @@ def get_query_pos_from_cigartuples(r): AlnBlock = namedtuple("SA", ["query_start", "query_end", "ref_start", "ref_end", "chrom", "mq", "strand", "this"]) -JoinEvent = namedtuple("JE", ["chrom", "event_pos", "chrom2", "pos2", "read_enum", "cigar_index"]) +JoinEvent = namedtuple("JE", ["chrom", "event_pos", "chrom2", "pos2", "query_pos", "query_end", "read_enum", "cigar_index"]) class AlignmentsSA: @@ -623,6 +672,7 @@ class AlignmentsSA: self.aln_strand = None self.query_aligns = [] self.join_result = [] + self.query_length = r.infer_read_length() # Note, this also counts hard-clips self._alignments_from_sa(r, gettid) def connect_alignments(self, r, max_dist=1000, mq_thresh=0, read_enum=0): @@ -633,18 +683,23 @@ class AlignmentsSA: self._connect_right(r, max_dist, mq_thresh, read_enum) def _alignments_from_sa(self, r, gettid): - qstart, qend, query_length = get_query_pos_from_cigartuples(r) + qstart, qend, query_length = get_query_pos_from_cigartuples(r, self.query_length) this_aln = AlnBlock(query_start=qstart, query_end=qend, ref_start=r.pos, ref_end=r.reference_end, chrom=r.rname, mq=r.mapq, strand="-" if r.flag & 16 else "+", this=True) self.aln_strand = this_aln.strand query_aligns = [this_aln] + + cdef int ref_start, ref_end, query_start, query_end for sa_block in r.get_tag("SA").split(";"): if sa_block == "": break + query_start = 0 + query_end = 0 sa = sa_block.split(",", 5) - matches = [(int(slen), opp) for slen, opp in re.findall(r'(\d+)([A-Z]{1})', sa[3])] - query_start, query_end, ref_start, ref_end = get_query_pos_from_cigarstring(matches, int(sa[1])) + ref_start = int(sa[1]) + ref_end = ref_start + parse_cigar(sa[3], query_start, query_end, ref_end) if this_aln.strand != sa[2]: start_temp = query_length - query_end query_end = start_temp + query_end - query_start @@ -658,13 +713,16 @@ class AlignmentsSA: break self.query_aligns = query_aligns - def _connect_left(self, r, max_dist, mq_thresh, read_enum): #, max_gap_size=100): + def _connect_left(self, r, max_dist, mq_thresh, read_enum): a = self.query_aligns[self.index] b = self.query_aligns[self.index - 1] - # gap = abs(a.query_start - b.query_end) - # if gap > max_gap_size: - # return event_pos = a.ref_start + query_pos = a.query_start + query_end = a.query_end + if r.is_reverse: + qtemp = query_pos + query_pos = self.query_length - query_end + query_end = self.query_length - qtemp chrom = a.chrom cigar_index = 0 if b.strand == self.aln_strand: @@ -673,21 +731,24 @@ class AlignmentsSA: pos2 = b.ref_start chrom2 = b.chrom if b.mq < mq_thresh: - self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, ReadEnum_t.BREAKEND, cigar_index)) + self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, query_pos, query_end, ReadEnum_t.BREAKEND, cigar_index)) return elif self.paired: if not r.flag & 2304 and r.flag & 2 and r.pnext <= event_pos and (chrom != chrom2 or abs(pos2 - event_pos) > max_dist): - self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, ReadEnum_t.BREAKEND, cigar_index)) + self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, query_pos, query_end, ReadEnum_t.BREAKEND, cigar_index)) return - self.join_result.append(JoinEvent(chrom, event_pos, chrom2, pos2, read_enum, cigar_index)) + self.join_result.append(JoinEvent(chrom, event_pos, chrom2, pos2, query_pos, query_end, read_enum, cigar_index)) - def _connect_right(self, r, max_dist, mq_thresh, read_enum): #, max_gap_size=100): + def _connect_right(self, r, max_dist, mq_thresh, read_enum): a = self.query_aligns[self.index] b = self.query_aligns[self.index + 1] - # gap = abs(b.query_start - a.query_end) - # if gap > max_gap_size: - # return event_pos = a.ref_end + query_pos = a.query_start + query_end = a.query_end + if r.is_reverse: + qtemp = query_pos + query_pos = self.query_length - query_end + query_end = self.query_length - qtemp chrom = a.chrom cigar_index = len(r.cigartuples) - 1 if b.strand == self.aln_strand: @@ -696,17 +757,17 @@ class AlignmentsSA: pos2 = b.ref_end chrom2 = b.chrom if b.mq < mq_thresh: - self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, ReadEnum_t.BREAKEND, cigar_index)) + self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, query_pos, query_end, ReadEnum_t.BREAKEND, cigar_index)) return elif self.paired: # If paired, and SA block fits between two normal primary alignments, and block is not local, then # ignore block and try and call insertion if not r.flag & 2304 and r.flag & 2 and r.pnext >= event_pos and (chrom != chrom2 or abs(pos2 - event_pos) > max_dist): # is primary, is proper pair. Use primary mate info, not SA - self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, ReadEnum_t.BREAKEND, cigar_index)) + self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, query_pos, query_end, ReadEnum_t.BREAKEND, cigar_index)) return - self.join_result.append(JoinEvent(chrom, event_pos, chrom2, pos2, read_enum, cigar_index)) - + self.join_result.append(JoinEvent(chrom, event_pos, chrom2, pos2, query_pos, query_end, read_enum, cigar_index)) +# cdef int cluster_clipped(Py_SimpleGraph G, r, ClipScoper_t clip_scope, chrom, pos, node_name): cdef int other_node cdef int count = 0 @@ -722,7 +783,7 @@ cdef int cluster_clipped(Py_SimpleGraph G, r, ClipScoper_t clip_scope, chrom, po cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_scope, TemplateEdges_t template_edges, NodeToName node_to_name, genome_scanner, - int flag, int chrom, tell, int cigar_index, int event_pos, + int flag, int chrom, tell, int cigar_index, int event_pos, int query_pos, int chrom2, int pos2, ClipScoper_t clip_scope, ReadEnum_t read_enum, bint p1_overlaps, bint p2_overlaps, bint mm_only, int clip_l, site_adder, int length_from_cigar, bint trust_ins_len, bint paired_end): @@ -733,12 +794,8 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_ cdef uint64_t v = xxhasher(bam_get_qname(r._delegate), len(r.qname), 42) # Hash qname to save mem cdef int bnd_site_node, bnd_site_node2 cdef int q_start - # echo("\nADDING", node_name, r.qname, (chrom, event_pos), (chrom2, pos2), flag, read_enum) node_to_name.append(v, flag, r.pos, chrom, tell, cigar_index, event_pos) # Index this list to get the template_name genome_scanner.add_to_buffer(r, node_name, tell) # Add read to buffer - if read_enum < 2: # Prevents joining up within-read svs with between-read svs - q_start = r.query_alignment_start if not r.flag & 16 else r.infer_query_length() - r.query_alignment_end - template_edges.add(r.qname, flag, node_name, q_start) both_overlap = p1_overlaps and p2_overlaps if not paired_end or (paired_end and read_enum != BREAKEND and not mm_only and not chrom2 == -1): @@ -761,14 +818,19 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_ bnd_site_node2 = site_adder.find_nearest_site(chrom2, pos2) if bnd_site_node != bnd_site_node2 and bnd_site_node2 >= 0 and not G.hasEdge(node_name, bnd_site_node2): G.addEdge(node_name, bnd_site_node2, 0) + + if read_enum < 2: # Prevents joining up within-read svs with between-read svs + template_edges.add(r.qname, flag, node_name, query_pos) + # Debug: - # if r.qname == "M03762:232:000000000-L65J4:1:2111:17729:15161": + # if r.qname == "cfa54189-601a-4a98-aeca-60ea41d0b931": # echo("---", r.qname, read_enum, node_name, (event_pos, pos2), length_from_cigar, list(other_nodes)) - # look = {'a4d38568-fd80-4785-8fa5-84ed132b445c', '2313a985-385c-4c84-b02c-dddfc627940b', '0031840a-bd2d-475d-9a04-528f71c7b512'} + # look = {129,128 } + # if node_name == 25: # if r.qname in look: # # if r.qname == "D00360:18:H8VC6ADXX:1:1210:7039:44052": - # echo(r.qname, r.flag, node_name, (chrom, event_pos), (chrom2, pos2), list(other_nodes), - # cigar_index, length_from_cigar, "enum=", read_enum) + # echo(r.qname, length_from_cigar, "nodename:", node_name, list(other_nodes), (chrom, event_pos), (chrom2, pos2), + # f"enum={read_enum}") # echo(list(other_nodes)) @@ -898,6 +960,7 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int event_pos = j.event_pos chrom2 = j.chrom2 pos2 = j.pos2 + query_pos = j.query_pos read_enum = j.read_enum cigar_index = j.cigar_index if read_enum == BREAKEND: @@ -916,7 +979,7 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int pos2 = event_pos add_to_graph(G, r, pe_scope, template_edges, node_to_name, genome_scanner, flag, chrom, - tell, cigar_index, event_pos, chrom2, pos2, clip_scope, read_enum, + tell, cigar_index, event_pos, query_pos, chrom2, pos2, clip_scope, read_enum, current_overlaps_roi, next_overlaps_roi, mm_only, clip_l, site_adder, 0, trust_ins_len, paired_end) return @@ -935,9 +998,13 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int pos2 = cigar_pos2 else: pos2 = event_pos + add_to_graph(G, r, pe_scope, template_edges, node_to_name, genome_scanner, flag, chrom, - tell, cigar_index, event_pos, chrom2, pos2, clip_scope, read_enum, current_overlaps_roi, next_overlaps_roi, - mm_only, clip_l, site_adder, length_from_cigar, trust_ins_len, paired_end) + tell, cigar_index, event_pos, + 1, + chrom2, pos2, clip_scope, read_enum, + current_overlaps_roi, next_overlaps_roi, + mm_only, clip_l, site_adder, length_from_cigar, trust_ins_len, paired_end) ### else: # Single end current_overlaps_roi, next_overlaps_roi = False, False # not supported @@ -947,21 +1014,28 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int all_aligns.connect_alignments(r, loci_dist, mapq_thresh, read_enum) if not all_aligns.join_result: return - # for chrom, event_pos, chrom2, pos2, read_e, cigar_index in all_aligns.join_result: for j in all_aligns.join_result: add_to_graph(G, r, pe_scope, template_edges, node_to_name, genome_scanner, flag, j.chrom, - tell, j.cigar_index, j.event_pos, j.chrom2, j.pos2, clip_scope, j.read_enum, + tell, j.cigar_index, j.event_pos, + j.query_pos, + j.chrom2, j.pos2, clip_scope, j.read_enum, current_overlaps_roi, next_overlaps_roi, - mm_only, clip_l, site_adder, 0, trust_ins_len, paired_end) - + mm_only, clip_l, site_adder, + 0, + trust_ins_len, paired_end) elif read_enum >= 2: # Sv within read or breakend + if read_enum == BREAKEND: + return chrom2 = r.rname if cigar_index != -1 and r.cigartuples[cigar_index][0] != 1: # If not insertion pos2 = cigar_pos2 else: pos2 = event_pos + add_to_graph(G, r, pe_scope, template_edges, node_to_name, genome_scanner, flag, chrom, - tell, cigar_index, event_pos, chrom2, pos2, clip_scope, read_enum, + tell, cigar_index, event_pos, + 1, + chrom2, pos2, clip_scope, read_enum, current_overlaps_roi, next_overlaps_roi, mm_only, clip_l, site_adder, length_from_cigar, trust_ins_len, paired_end) @@ -1090,7 +1164,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering int paired_end=1, int read_length=150, bint contigs=True, float norm_thresh=100, float spd_thresh=0.3, bint mm_only=False, sites=None, bint trust_ins_len=True, low_mem=False, temp_dir=".", - find_n_aligned_bases=True): + find_n_aligned_bases=True, position_distance_thresh=0.8): logging.info("Building graph with clustering {} bp".format(clustering_dist)) cdef TemplateEdges_t template_edges = TemplateEdges() # Edges are added between alignments from same template, after building main graph cdef int event_pos, cigar_index, opp, length @@ -1099,7 +1173,8 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering minimizer_support_thresh=minimizer_support_thresh, minimizer_breadth=minimizer_breadth, read_length=read_length) # Infers long-range connections, outside local scope using pe information - cdef PairedEndScoper_t pe_scope = PairedEndScoper(max_dist, clustering_dist, infile.header.nreferences, norm_thresh, spd_thresh, paired_end) + cdef PairedEndScoper_t pe_scope = PairedEndScoper(max_dist, clustering_dist, infile.header.nreferences, norm_thresh, spd_thresh, paired_end, position_distance_thresh) + bad_clip_counter = BadClipCounter(infile.header.nreferences, low_mem, temp_dir) cdef Py_SimpleGraph G = map_set_utils.Py_SimpleGraph() site_adder = None @@ -1157,7 +1232,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION)) added = True elif opp == 2: - if length >= min_sv_size: # elif! + if length >= min_sv_size: pos2 = event_pos + length events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION)) added = True @@ -1332,6 +1407,7 @@ cdef tuple count_support_between(Py_SimpleGraph G, parts): seen_t.update(current_t) # Only count edge once for t in current_t: # save memory by converting support_between to array counts[t] = [np.fromiter(m, dtype="uint32", count=len(m)) for m in counts[t]] + return counts, self_counts @@ -1393,6 +1469,24 @@ cpdef break_large_component(Py_SimpleGraph G, component, int min_support): return jobs +@cython.auto_pickle(True) +cdef class GraphComponent: + cdef public object parts, s_between, reads, s_within, n2n, info + def __init__(self, parts, s_between, reads, s_within, n2n, info): + self.parts = parts + self.s_between = s_between + self.reads = reads + self.s_within = s_within + self.n2n = n2n + self.info = info + def as_dict(self): + return {"parts": self.parts, + "s_between": self.s_between, + "reads": self.reads, + "s_within": self.s_within, + "n2n": self.n2n, + "info": self.info} + cpdef proc_component(node_to_name, component, read_buffer, infile, Py_SimpleGraph G, int min_support, int procs, int paired_end, sites_index): n2n = {} @@ -1428,36 +1522,27 @@ cpdef proc_component(node_to_name, component, read_buffer, infile, Py_SimpleGrap if len(support_between) == 0 and len(support_within) == 0: if not paired_end: if len(n2n) >= min_support or len(reads) >= min_support or info: - d = {"parts": [], "s_between": {}, "reads": reads, "s_within": {}, "n2n": n2n} - if info: - d["info"] = info - return d + return GraphComponent((), (), {}, (), n2n, info) else: return else: # single paired end template can have 3 nodes e.g. two reads plus supplementary if min_support == 1 and (len(n2n) >= min_support or len(reads) >= min_support): - d = {"parts": [], "s_between": {}, "reads": reads, "s_within": {}, "n2n": n2n} - if info: - d["info"] = info - return d + return GraphComponent((), (), {}, (), n2n, info) elif len(reads) >= min_support or info: - d = {"parts": [], "s_between": {}, "reads": reads, "s_within": {}, "n2n": n2n} - if info: - d["info"] = info - return d + return GraphComponent((), (), {}, (), n2n, info) else: return # Debug: - # if 157835 in n2n: + # if 14 in n2n: # echo("parts", partitions) # echo("s_between", support_between) # echo("s_within", support_within) - + # if len(partitions) > 1: # echo("parts", partitions) # echo("s_between", support_between) # echo("s_within", support_within) - + # # echo("n2n", n2n.keys()) # node_look = set(range(653526, 653532)) # node_look = set(range(8)) @@ -1466,7 +1551,5 @@ cpdef proc_component(node_to_name, component, read_buffer, infile, Py_SimpleGrap # echo("parts", partitions) # echo("s_between", sb) # echo("s_within", support_within) - d = {"parts": partitions, "s_between": support_between, "reads": reads, "s_within": support_within, "n2n": n2n} - if info: - d["info"] = info - return d + + return GraphComponent(partitions, tuple(support_between.items()), reads, tuple(support_within.items()), n2n, info) diff --git a/dysgu/main.py b/dysgu/main.py index f7badf9..dd22cba 100644 --- a/dysgu/main.py +++ b/dysgu/main.py @@ -189,6 +189,7 @@ def cli(): type=int, callback=add_option_set) @click.option('--dist-norm', help=f"Distance normalizer [default: {defaults['dist_norm']}]", type=float, callback=add_option_set) @click.option('--spd', help="Span position distance", default=0.3, type=float, show_default=True) +@click.option('--sd', help="Span distance, only SV span is considered, lower values separate multi-allelic sites", default=0.8, type=float, show_default=True) @click.option('--trust-ins-len', help=f"Trust insertion length from cigar, for high error rate reads use False [default: {defaults['trust_ins_len']}]", type=str, callback=add_option_set) @click.option('--length-extend', help=f"Extend SV length if any nearby gaps found with length >= length-extend. Ignored for paired-end reads", type=int, default=15, show_default=True) @@ -348,6 +349,7 @@ def get_reads(ctx, **kwargs): type=int, callback=add_option_set) @click.option('--dist-norm', help=f"Distance normalizer [default: {defaults['dist_norm']}]", type=float, callback=add_option_set) @click.option('--spd', help="Span position distance", default=0.3, type=float, show_default=True) +@click.option('--sd', help="Span distance, only SV span is considered, lower values separate multi-allelic sites", default=0.8, type=float, show_default=True) @click.option('--trust-ins-len', help=f"Trust insertion length from cigar, for high error rate reads use False [default: {defaults['trust_ins_len']}]", type=str, callback=add_option_set) @click.option('--length-extend', help=f"Extend SV length if any nearby gaps found with length >= length-extend. Ignored for paired-end reads", type=int, default=15, show_default=True) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index b9a0044..d856b25 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -286,9 +286,11 @@ def ref_repetitiveness(events, ref_genome): if e.svlen < 150 and e.svtype == "DEL": try: ref_seq = ref_genome.fetch(e.chrA, e.posA, e.posB).upper() - except ValueError: # todo out or range, needs fixing - continue - e.ref_rep = compute_rep(ref_seq) + e.ref_rep = compute_rep(ref_seq) + except (ValueError, KeyError, IndexError) as errors: + # Might be different reference genome version, compared to bam genome + logging.warning("Error fetching reference chromosome: {}".format(e.chrA), errors) + e.ref_seq = 0 return events diff --git a/dysgu/sv_category.pxd b/dysgu/sv_category.pxd index 105f3d0..7457a6f 100644 --- a/dysgu/sv_category.pxd +++ b/dysgu/sv_category.pxd @@ -7,6 +7,7 @@ cdef class AlignmentItem: a_len, b_len, query_gap, read_overlaps_mate, size_inferred, query_overlap, inferred_sv_len cdef public str svtype, join_type cdef public object read_a, read_b + cdef public object a_node_info, b_node_info cdef void classify_d(AlignmentItem v) \ No newline at end of file diff --git a/dysgu/sv_category.pyx b/dysgu/sv_category.pyx index 5189413..97199dd 100644 --- a/dysgu/sv_category.pyx +++ b/dysgu/sv_category.pyx @@ -6,16 +6,10 @@ from dysgu.map_set_utils import echo cdef class AlignmentItem: """Data holder for classifying alignments into SV types""" - # cdef public int chrA, chrB, priA, priB, rA, rB, posA, endA, posB, endB, strandA, strandB, left_clipA, right_clipA,\ - # left_clipB, right_clipB, breakA_precise, breakB_precise, breakA, breakB, a_qstart, a_qend, b_qstart, b_qend,\ - # a_len, b_len, query_gap, read_overlaps_mate, size_inferred, query_overlap, inferred_sv_len - # cdef public str svtype, join_type - # cdef public object read_a, read_b def __cinit__(self, int chrA, int chrB, int priA, int priB, int rA, int rB, int posA, int endA, int posB, int endB, int strandA, int strandB, int left_clipA, int right_clipA, int left_clipB, int right_clipB, int a_qstart, int a_qend, int b_qstart, int b_qend, int a_len, int b_len, - int read_overlaps_mate, object read_a, - object read_b): + int read_overlaps_mate, read_a, read_b, a_node_info=None, b_node_info=None): self.chrA = chrA self.chrB = chrB self.priA = priA @@ -41,7 +35,8 @@ cdef class AlignmentItem: self.read_overlaps_mate = read_overlaps_mate self.read_a = read_a self.read_b = read_b - + self.a_node_info = a_node_info + self.b_node_info = b_node_info # These will be determined self.breakA_precise = 0 self.breakB_precise = 0 @@ -53,6 +48,56 @@ cdef class AlignmentItem: self.inferred_sv_len = -1 self.size_inferred = 0 # set to 1 if insertion size was inferred +# cdef void from_node_info(AlignmentItem v): +# a_info = v.a_node_info +# b_info = v.b_node_info +# +# v.breakA = v.a_node_info.event_pos +# v.breakA_precise = 1 if v.a_node_info.read_enum == 1 else 0 +# v.breakB = v.b_node_info.event_pos +# v.breakB_precise = 1 if v.b_node_info.read_enum == 1 else 0 +# +# v.join_type = f"{v.strandA}to{v.strandB}" +# +# cdef int query_gap = 0 +# cdef int query_overlap = 0 +# if v.rA == v.rB: # same read +# if v.b_qstart < v.a_qstart: # B is first on query +# query_gap = v.a_qstart - v.b_qend +# else: +# query_gap = v.b_qstart - v.a_qend +# if query_gap < 0: +# query_overlap = abs(query_gap) +# query_gap = 0 +# +# v.query_gap = query_gap +# v.query_overlap = query_overlap +# +# if a_info.chrom != b_info.chrom: +# v.svtype = "TRA" +# else: +# if a_info.event_pos < b_info.event_pos: +# if a_info.connect_right: +# if not b_info.connect_right: +# v.svtype = "DEL" +# else: +# v.svtype = "INV" +# else: +# if not b_info.connect_right: +# v.svtype = "INV" +# else: +# v.svtype = "DUP" +# else: +# if a_info.connect_right: +# if not b_info.connect_right: +# v.svtype = "DUP" +# else: +# v.svtype = "INV" +# else: +# if not b_info.connect_right: +# v.svtype = "INV" +# else: +# v.svtype = "DEL" cdef void two_primary(AlignmentItem v): @@ -272,7 +317,6 @@ cdef void same_read(AlignmentItem v): v.breakB = v.endB v.svtype = "DUP" - # v.svtype = "INS" # Check if gap on query is bigger than gap on reference ref_gap = abs(v.breakB - v.breakA) if v.a_qstart > v.b_qstart: # B is first on query seq @@ -785,7 +829,6 @@ cdef void translocation(AlignmentItem v): v.query_gap = query_gap v.query_overlap = query_overlap - cdef void classify_d(AlignmentItem v): #, debug=False): v.breakA_precise = 0