Skip to content

Commit

Permalink
#125 better calling of low support SVs
Browse files Browse the repository at this point in the history
  • Loading branch information
kcleal committed Mar 4, 2025
1 parent de5512f commit f296b84
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 218 deletions.
64 changes: 22 additions & 42 deletions dysgu/call_component.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,6 @@ cdef make_single_call(sub_informative, insert_size, insert_stdev, insert_ppf, mi
as2 = None
ref_bases = 0
if to_assemble or len(spanning_alignments) > 0:
# echo('MAKE SINGLE CALL')
if er.preciseA:
as1 = consensus.base_assemble(u_reads, er.posA, 500)
if as1 and (er.svtype != "TRA" or (as1['contig'] and (as1['contig'][0].islower() or as1['contig'][-1].islower()))):
Expand Down Expand Up @@ -965,7 +964,6 @@ cdef linear_scan_clustering(spanning, bint hp_tag):
def process_spanning(bint paired_end, spanning_alignments, float divergence, length_extend, informative,
generic_insertions, float insert_ppf, bint to_assemble, bint hp_tag):

# echo("PROCESS SPANNING")
cdef int min_found_support = 0
cdef str svtype, jointype
cdef bint passed
Expand Down Expand Up @@ -2055,33 +2053,10 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
seen.remove(u)
if v in seen:
seen.remove(v)

events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1,
assemble_contigs,
sites_info, paired_end, hp_tag)

# finds reads that should be a single partition
# u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v)
# if len(u_reads) > 0 and len(v_reads) > 0:
# events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs,
# sites_info, paired_end)
# if u_single:
# res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
# sites_info, paired_end, length_extend, divergence)
# if res:
# if isinstance(res, EventResult):
# events.append(res)
# else:
# events += res
# if v_single:
# res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
# sites_info, paired_end, length_extend, divergence)
# if res:
# if isinstance(res, EventResult):
# events.append(res)
# else:
# events += res

# Process any singles / unconnected blocks
if seen:
for part_key in seen:
Expand All @@ -2101,23 +2076,28 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
events += res

# Check for events within clustered nodes
if data.s_within:
for k, d in data.s_within: #.items():
o_count = out_counts[k]
i_counts = len(d)
if i_counts > max_single_size:
continue
if o_count > 0 and i_counts > (2*min_support) and i_counts > o_count:
rds = get_reads(bam, d, data.reads, data.n2n, 0, info)
if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0):
continue
res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
sites_info, paired_end, length_extend, divergence, hp_tag)
if res:
if isinstance(res, EventResult):
events.append(res)
else:
events += res
# if data.s_within:
# for k, d in data.s_within:
# o_count = out_counts[k]
# i_counts = len(d)
#
# if i_counts > max_single_size:
# continue
# # if o_count > 0 and i_counts >= (2*min_support) and i_counts > o_count:
# # if o_count > 0 or i_counts > 0:
# rds = get_reads(bam, d, data.reads, data.n2n, 0, info)
# if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0):
# continue
# echo('Calling single 1')
# res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
# sites_info, paired_end, length_extend, divergence, hp_tag)
# for r in res:
# echo(r.posA, r.posB, r.svlen)
# if res:
# if isinstance(res, EventResult):
# events.append(res)
# else:
# events += res
return events


Expand Down
2 changes: 1 addition & 1 deletion dysgu/cluster.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
if args["pl"] == "pe": # reads with internal SVs can be detected at lower support
lower_bound_support = min_support - 1 if min_support - 1 > 1 else 1
else:
lower_bound_support = 2
lower_bound_support = min_support

component_path = f"{tdir}/components.bin"
cdef bytes cmp_file = component_path.encode("ascii") # write components to file if low-mem used
Expand Down
62 changes: 13 additions & 49 deletions dysgu/coverage.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,6 @@ cdef class GenomeScanner:
self.procs = read_threads
self.buff_size = buffer_size
self.current_bin = []
self.current_cov = 0
self.current_chrom = 0
self.current_pos = 0
self.current_cov_array = None
self.reads_dropped = 0
self.depth_d = {}
Expand Down Expand Up @@ -152,10 +149,10 @@ cdef class GenomeScanner:

if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0: # not primary, duplicate or unmapped?
continue
self._add_to_bin_buffer(aln, tell)
yield aln, tell
tell = 0 if self.no_tell else self.input_bam.tell()
while len(self.staged_reads) > 0:
yield self.staged_reads.popleft()
# yield aln, tell

# when 'call' command was run only. generate coverage track here
else:
self.cpp_cov_track.set_max_cov(self.max_cov)
Expand All @@ -167,7 +164,7 @@ cdef class GenomeScanner:

cigar_l = aln._delegate.core.n_cigar
if cigar_l == 0 or aln.flag & 1284:
tell = 0 if self.no_tell else self.input_bam.tell()
# tell = 0 if self.no_tell else self.input_bam.tell()
continue

if aln.rname != self.current_tid:
Expand Down Expand Up @@ -203,18 +200,14 @@ cdef class GenomeScanner:
index_start += length

if aln.mapq < mq_thresh or not good_read or not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos):
tell = 0 if self.no_tell else self.input_bam.tell()
continue

self._add_to_bin_buffer(aln, tell)
yield aln, tell
tell = 0 if self.no_tell else self.input_bam.tell()
while len(self.staged_reads) > 0:
yield self.staged_reads.popleft()
# yield aln, tell

if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path, self.input_bam.get_reference_name(self.current_tid)).encode("ascii")
self.cpp_cov_track.write_track(out_path)
if len(self.current_bin) > 0:
yield self.current_bin

else:
logging.info("Searching --regions and collecting reads from mate-pair locations")
Expand Down Expand Up @@ -291,16 +284,13 @@ cdef class GenomeScanner:
continue
if not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos):
continue
self._add_to_bin_buffer(aln, tell)
yield aln, tell
tell = 0 if self.no_tell else self.input_bam.tell()
while len(self.staged_reads) > 0:
yield self.staged_reads.popleft()
# yield aln, tell

if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path, self.input_bam.get_reference_name(self.current_tid)).encode("ascii")
self.cpp_cov_track.write_track(out_path)
if len(self.current_bin) > 0:
yield self.current_bin


def get_read_properties(self, int max_tlen, int insert_median, int insert_stdev, int read_len, ibam=None, find_divergence=False):
Expand Down Expand Up @@ -352,7 +342,8 @@ cdef class GenomeScanner:
continue
tell = 0 if self.no_tell else self.input_bam.tell()
if self.no_tell:
self._add_to_bin_buffer(a, tell)
self.staged_reads.append((a, tell))
# self._add_to_bin_buffer(a, tell)
if a.rname != self.current_tid:
if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path.outpath, self.input_bam.get_reference_name(self.current_tid)).encode("ascii")
Expand Down Expand Up @@ -447,7 +438,7 @@ cdef class GenomeScanner:
# Read the rest of the genome, reads are sent in blocks
cdef int total_reads = 0
for staged in self._get_reads():
total_reads += len(staged)
total_reads += 1 # len(staged)
yield staged
if total_reads == 0:
logging.critical("No reads found, finishing")
Expand All @@ -464,33 +455,6 @@ cdef class GenomeScanner:
elif self.no_tell:
raise BufferError("Read buffer has overflowed, increase --buffer-size")

def _add_to_bin_buffer(self, AlignedSegment a, tell):
# Calculates coverage information on fly, drops high coverage regions, buffers reads
cdef int flag = a.flag
cdef uint32_t cigar_l = a._delegate.core.n_cigar
if flag & 1540 or cigar_l == 0: # or a.seq is None:
return
cdef int rname = a.rname
cdef int apos = a.pos
cdef int bin_pos = int(apos / 100)
cdef int ref_length
cdef str reference_name = ""
cdef int aend = a.reference_end
cdef float current_coverage
if self.current_chrom != rname:
self.current_chrom = rname
# in_roi = False
# if self.overlap_regions:
# in_roi = intersecter(self.overlap_regions, a.rname, apos, apos+1)
if rname == self.current_chrom and bin_pos == self.current_pos:
self.current_bin.append((a, tell))
else:
if len(self.current_bin) != 0:
self.staged_reads.append(self.current_bin)
self.current_chrom = rname
self.current_pos = bin_pos
self.current_bin = [(a, tell)]


cdef float add_coverage(int start, int end, DTYPE_t[:] chrom_depth) nogil:
cdef float fs = start / 100
Expand Down Expand Up @@ -622,7 +586,7 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov
r.svlen = 1000000
r.mcov = max_depth

r.a_freq = r.a_freq / reads_10kb
r.a_freq = r.a_freq / (reads_10kb + 1e-5)
r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2)
new_events.append(r)
return new_events
Loading

0 comments on commit f296b84

Please sign in to comment.