From f296b8452c7b20dbffd69c02aef1a55291614c8e Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Tue, 4 Mar 2025 15:18:14 +0000
Subject: [PATCH] #125 better calling of low support SVs

---
 dysgu/call_component.pyx |  64 ++++-------
 dysgu/cluster.pyx        |   2 +-
 dysgu/coverage.pyx       |  62 +++--------
 dysgu/graph.pyx          | 234 +++++++++++++++++++--------------------
 dysgu/post_call.py       |   9 +-
 pyproject.toml           |   2 +-
 6 files changed, 155 insertions(+), 218 deletions(-)

diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx
index 0d5ebcf..40c259b 100644
--- a/dysgu/call_component.pyx
+++ b/dysgu/call_component.pyx
@@ -672,7 +672,6 @@ cdef make_single_call(sub_informative, insert_size, insert_stdev, insert_ppf, mi
     as2 = None
     ref_bases = 0
     if to_assemble or len(spanning_alignments) > 0:
-        # echo('MAKE SINGLE CALL')
         if er.preciseA:
             as1 = consensus.base_assemble(u_reads, er.posA, 500)
             if as1 and (er.svtype != "TRA" or (as1['contig'] and (as1['contig'][0].islower() or as1['contig'][-1].islower()))):
@@ -965,7 +964,6 @@ cdef linear_scan_clustering(spanning, bint hp_tag):
 def process_spanning(bint paired_end, spanning_alignments, float divergence, length_extend, informative,
                      generic_insertions, float insert_ppf, bint to_assemble, bint hp_tag):
 
-    # echo("PROCESS SPANNING")
     cdef int min_found_support = 0
     cdef str svtype, jointype
     cdef bint passed
@@ -2055,33 +2053,10 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
                 seen.remove(u)
             if v in seen:
                 seen.remove(v)
-
             events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1,
                                assemble_contigs,
                                sites_info, paired_end, hp_tag)
 
-            # finds reads that should be a single partition
-            # u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v)
-            # if len(u_reads) > 0 and len(v_reads) > 0:
-            #     events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs,
-            #                        sites_info, paired_end)
-            # if u_single:
-            #     res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
-            #                  sites_info, paired_end, length_extend, divergence)
-            #     if res:
-            #         if isinstance(res, EventResult):
-            #             events.append(res)
-            #         else:
-            #             events += res
-            # if v_single:
-            #     res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
-            #                  sites_info, paired_end, length_extend, divergence)
-            #     if res:
-            #         if isinstance(res, EventResult):
-            #             events.append(res)
-            #         else:
-            #             events += res
-
     # Process any singles / unconnected blocks
     if seen:
         for part_key in seen:
@@ -2101,23 +2076,28 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
                         events += res
 
     # Check for events within clustered nodes
-    if data.s_within:
-        for k, d in data.s_within:  #.items():
-            o_count = out_counts[k]
-            i_counts = len(d)
-            if i_counts > max_single_size:
-                continue
-            if o_count > 0 and i_counts > (2*min_support) and i_counts > o_count:
-                rds = get_reads(bam, d, data.reads, data.n2n, 0, info)
-                if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0):
-                        continue
-                res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
-                             sites_info, paired_end, length_extend, divergence, hp_tag)
-                if res:
-                    if isinstance(res, EventResult):
-                        events.append(res)
-                    else:
-                        events += res
+    # if data.s_within:
+    #     for k, d in data.s_within:
+    #         o_count = out_counts[k]
+    #         i_counts = len(d)
+    #
+    #         if i_counts > max_single_size:
+    #             continue
+    #         # if o_count > 0 and i_counts >= (2*min_support) and i_counts > o_count:
+    #         # if o_count > 0 or i_counts > 0:
+    #         rds = get_reads(bam, d, data.reads, data.n2n, 0, info)
+    #         if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0):
+    #             continue
+    #         echo('Calling single 1')
+    #         res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
+    #                      sites_info, paired_end, length_extend, divergence, hp_tag)
+    #         for r in res:
+    #             echo(r.posA, r.posB, r.svlen)
+    #         if res:
+    #             if isinstance(res, EventResult):
+    #                 events.append(res)
+    #             else:
+    #                 events += res
     return events
 
 
diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx
index 6ce56ad..dd9b56c 100644
--- a/dysgu/cluster.pyx
+++ b/dysgu/cluster.pyx
@@ -273,7 +273,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
     if args["pl"] == "pe":  # reads with internal SVs can be detected at lower support
         lower_bound_support = min_support - 1 if min_support - 1 > 1 else 1
     else:
-        lower_bound_support = 2
+        lower_bound_support = min_support
 
     component_path = f"{tdir}/components.bin"
     cdef bytes cmp_file = component_path.encode("ascii")  # write components to file if low-mem used
diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx
index 32ca3e5..1df5430 100644
--- a/dysgu/coverage.pyx
+++ b/dysgu/coverage.pyx
@@ -112,9 +112,6 @@ cdef class GenomeScanner:
         self.procs = read_threads
         self.buff_size = buffer_size
         self.current_bin = []
-        self.current_cov = 0
-        self.current_chrom = 0
-        self.current_pos = 0
         self.current_cov_array = None
         self.reads_dropped = 0
         self.depth_d = {}
@@ -152,10 +149,10 @@ cdef class GenomeScanner:
 
                     if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0:  # not primary, duplicate or unmapped?
                         continue
-                    self._add_to_bin_buffer(aln, tell)
+                    yield aln, tell
                     tell = 0 if self.no_tell else self.input_bam.tell()
-                    while len(self.staged_reads) > 0:
-                        yield self.staged_reads.popleft()
+                    # yield aln, tell
+
             # when 'call' command was run only. generate coverage track here
             else:
                 self.cpp_cov_track.set_max_cov(self.max_cov)
@@ -167,7 +164,7 @@ cdef class GenomeScanner:
 
                     cigar_l = aln._delegate.core.n_cigar
                     if cigar_l == 0 or aln.flag & 1284:
-                        tell = 0 if self.no_tell else self.input_bam.tell()
+                        # tell = 0 if self.no_tell else self.input_bam.tell()
                         continue
 
                     if aln.rname != self.current_tid:
@@ -203,18 +200,14 @@ cdef class GenomeScanner:
                             index_start += length
 
                     if aln.mapq < mq_thresh or not good_read or not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos):
-                        tell = 0 if self.no_tell else self.input_bam.tell()
                         continue
-
-                    self._add_to_bin_buffer(aln, tell)
+                    yield aln, tell
                     tell = 0 if self.no_tell else self.input_bam.tell()
-                    while len(self.staged_reads) > 0:
-                        yield self.staged_reads.popleft()
+                    # yield aln, tell
+
                 if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
                     out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path, self.input_bam.get_reference_name(self.current_tid)).encode("ascii")
                     self.cpp_cov_track.write_track(out_path)
-            if len(self.current_bin) > 0:
-                yield self.current_bin
 
         else:
             logging.info("Searching --regions and collecting reads from mate-pair locations")
@@ -291,16 +284,13 @@ cdef class GenomeScanner:
                         continue
                     if not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos):
                         continue
-                    self._add_to_bin_buffer(aln, tell)
+                    yield aln, tell
                     tell = 0 if self.no_tell else self.input_bam.tell()
-                    while len(self.staged_reads) > 0:
-                        yield self.staged_reads.popleft()
+                    # yield aln, tell
 
                 if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
                     out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path, self.input_bam.get_reference_name(self.current_tid)).encode("ascii")
                     self.cpp_cov_track.write_track(out_path)
-                if len(self.current_bin) > 0:
-                    yield self.current_bin
 
 
     def get_read_properties(self, int max_tlen, int insert_median, int insert_stdev, int read_len, ibam=None, find_divergence=False):
@@ -352,7 +342,8 @@ cdef class GenomeScanner:
                     continue
                 tell = 0 if self.no_tell else self.input_bam.tell()
                 if self.no_tell:
-                    self._add_to_bin_buffer(a, tell)
+                    self.staged_reads.append((a, tell))
+                    # self._add_to_bin_buffer(a, tell)
                     if a.rname != self.current_tid:
                         if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
                             out_path = "{}/{}.dysgu_chrom.bin".format(self.cov_track_path.outpath, self.input_bam.get_reference_name(self.current_tid)).encode("ascii")
@@ -447,7 +438,7 @@ cdef class GenomeScanner:
         # Read the rest of the genome, reads are sent in blocks
         cdef int total_reads = 0
         for staged in self._get_reads():
-            total_reads += len(staged)
+            total_reads += 1  # len(staged)
             yield staged
         if total_reads == 0:
             logging.critical("No reads found, finishing")
@@ -464,33 +455,6 @@ cdef class GenomeScanner:
         elif self.no_tell:
             raise BufferError("Read buffer has overflowed, increase --buffer-size")
 
-    def _add_to_bin_buffer(self, AlignedSegment a, tell):
-        # Calculates coverage information on fly, drops high coverage regions, buffers reads
-        cdef int flag = a.flag
-        cdef uint32_t cigar_l = a._delegate.core.n_cigar
-        if flag & 1540 or cigar_l == 0: # or a.seq is None:
-            return
-        cdef int rname = a.rname
-        cdef int apos = a.pos
-        cdef int bin_pos = int(apos / 100)
-        cdef int ref_length
-        cdef str reference_name = ""
-        cdef int aend = a.reference_end
-        cdef float current_coverage
-        if self.current_chrom != rname:
-            self.current_chrom = rname
-        # in_roi = False
-        # if self.overlap_regions:
-        #     in_roi = intersecter(self.overlap_regions, a.rname, apos, apos+1)
-        if rname == self.current_chrom and bin_pos == self.current_pos:
-            self.current_bin.append((a, tell))
-        else:
-            if len(self.current_bin) != 0:
-                self.staged_reads.append(self.current_bin)
-            self.current_chrom = rname
-            self.current_pos = bin_pos
-            self.current_bin = [(a, tell)]
-
 
 cdef float add_coverage(int start, int end, DTYPE_t[:] chrom_depth) nogil:
     cdef float fs = start / 100
@@ -622,7 +586,7 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov
             r.svlen = 1000000
         r.mcov = max_depth
 
-        r.a_freq = r.a_freq / reads_10kb
+        r.a_freq = r.a_freq / (reads_10kb + 1e-5)
         r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2)
         new_events.append(r)
     return new_events
diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx
index cd0c984..2f0353d 100644
--- a/dysgu/graph.pyx
+++ b/dysgu/graph.pyx
@@ -827,7 +827,6 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_
         pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar,
                                   trust_ins_len)
 
-        # other_nodes = pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, trust_ins_len)
         if not dereference(found_nodes_exact).empty():
             for other_node in dereference(found_nodes_exact):
                 if node_to_name.same_template(node_name, other_node):
@@ -1289,144 +1288,135 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
     cdef int n_checked_for_hp_tag = 0
     if no_phase:
         n_checked_for_hp_tag = 10_001
-    for chunk in genome_scanner.iter_genome():
-        for r, tell in chunk:
-            if r.mapq < mapq_thresh:
+
+    for r, tell in genome_scanner.iter_genome():
+        if r.mapq < mapq_thresh:
+            continue
+        pos2 = -1
+        event_pos = r.pos
+        added = False
+        clipped = 0
+        events_to_add.clear()
+        cigar_l = r._delegate.core.n_cigar
+        cigar_p = bam_get_cigar(r._delegate)
+        if not paired_end:
+            if n_checked_for_hp_tag < 10_000:
+                n_checked_for_hp_tag += 1
+                if r.has_tag("HP"):
+                    hp_tag_found = <bint>True
+                    n_checked_for_hp_tag = 10_001
+            if r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")):
                 continue
-            pos2 = -1
-            event_pos = r.pos
-            added = False
-            clipped = 0
-            events_to_add.clear()
-            cigar_l = r._delegate.core.n_cigar
-            cigar_p = bam_get_cigar(r._delegate)
-            if not paired_end:
-                if n_checked_for_hp_tag < 10_000:
-                    n_checked_for_hp_tag += 1
-                    if r.has_tag("HP"):
-                        hp_tag_found = <bint>True
-                        n_checked_for_hp_tag = 10_001
-                if r.has_tag("NM") and edit_distance_too_high(cigar_p, cigar_l, max_divergence, r.get_tag("NM")):
+
+        if cigar_l > 1:
+            if r.has_tag("SA"):
+                # Set cigar-index to -1 means it is unset, will be determined during SA parsing
+                cigar_index = -1
+                process_alignment(G, r, clip_l, max_dist, gettid,
+                                  overlap_regions, clustering_dist, pe_scope,
+                                  cigar_index, event_pos, paired_end, tell, genome_scanner,
+                                  template_edges, node_to_name, pos2, mapq_thresh, clip_scope, ReadEnum_t.SPLIT,
+                                  bad_clip_counter, mm_only, site_adder, 0, trust_ins_len)
+                added = True
+            for cigar_index in range(cigar_l):
+                cigar_value = cigar_p[cigar_index]
+                opp = <int> cigar_value & 15
+                length = <int> cigar_value >> 4
+
+                if opp == 0 or opp == 7 or opp == 8:
+                    if find_n_aligned_bases:
+                        n_aligned_bases += length
+                    event_pos += length
                     continue
 
-            if cigar_l > 1:
-                if r.has_tag("SA"):
-                    # Set cigar-index to -1 means it is unset, will be determined during SA parsing
-                    cigar_index = -1
+                if opp == 1:
+                    if length >= min_sv_size:
+                        pos2 = event_pos + length
+                        events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION))
+                        added = True
+                elif opp == 2:
+                    if length >= min_sv_size:
+                        pos2 = event_pos + length
+                        events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION))
+                        added = True
+                    event_pos += length
+                elif opp == 3:
+                    if length >= min_sv_size:
+                        pos2 = event_pos + length
+                        events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP))
+                        added = True
+                    event_pos += length
+                else:
+                    if opp != 4 and opp != 5:
+                        event_pos += length
+                    elif opp == 4 and length >= clip_l:
+                        clipped = 1
+
+        if (paired_end and not added and contigs) or not paired_end:
+            # Whole alignment will be used, try infer position from soft-clip
+            cigar_index = -1
+            pos2 = -1
+            left_clip_size = 0
+            right_clip_size = 0
+            clip_sizes_hard(r, &left_clip_size, &right_clip_size)  # soft and hard-clips
+            if r.flag & 8 and clipped:  # paired by inference
+                # skip if both ends are clipped, usually means its a chunk of badly mapped sequence
+                # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ):
+                if not (left_clip_size and right_clip_size) and good_quality_clip(r, 20):
+                    # Mate is unmapped, insertion type. Only add if soft-clip is available
                     process_alignment(G, r, clip_l, max_dist, gettid,
                                       overlap_regions, clustering_dist, pe_scope,
                                       cigar_index, event_pos, paired_end, tell, genome_scanner,
-                                      template_edges, node_to_name, pos2, mapq_thresh, clip_scope, ReadEnum_t.SPLIT,
-                                      bad_clip_counter, mm_only, site_adder, 0, trust_ins_len)
-                    added = True
-                for cigar_index in range(cigar_l):
-                    cigar_value = cigar_p[cigar_index]
-                    opp = <int> cigar_value & 15
-                    length = <int> cigar_value >> 4
-
-                    if opp == 0 or opp == 7 or opp == 8:
-                        if find_n_aligned_bases:
-                            n_aligned_bases += length
-                        event_pos += length
-                        continue
-
-                    if opp == 1:
-                        if length >= min_sv_size:
-                            # Insertion type
-                            # if event_pos < 998000 or event_pos > 999000:
-                            #     continue
-                            pos2 = event_pos + length
-                            # if not events_to_add.empty() and event_pos - events_to_add.back().event_pos < elide_threshold and events_to_add.back().read_enum == ReadEnum_t.INSERTION:
-                            #     continue
-                            # else:
-                            events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION))
-                            added = True
-                    elif opp == 2:
-                        if length >= min_sv_size:
-                            pos2 = event_pos + length
-                            # if not events_to_add.empty() and event_pos - events_to_add.back().event_pos < elide_threshold and events_to_add.back().read_enum == ReadEnum_t.DELETION:
-                            #     continue
-                            # else:
-                            events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION))
-                            added = True
-                        event_pos += length
-                    elif opp == 3:
-                        if length >= min_sv_size:
-                            pos2 = event_pos + length
-                            events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP))
-                            added = True
-                        event_pos += length
-                    else:
-                        if opp != 4 and opp != 5:
-                            event_pos += length
-                        elif opp == 4 and length >= clip_l:
-                            clipped = 1
-
-            if (paired_end and not added and contigs) or not paired_end:
-                # Whole alignment will be used, try infer position from soft-clip
-                cigar_index = -1
-                pos2 = -1
-                left_clip_size = 0
-                right_clip_size = 0
-                clip_sizes_hard(r, &left_clip_size, &right_clip_size)  # soft and hard-clips
-                if r.flag & 8 and clipped:  # paired by inference
-                    # skip if both ends are clipped, usually means its a chunk of badly mapped sequence
-                    # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ):
-                    if not (left_clip_size and right_clip_size) and good_quality_clip(r, 20):
-                        # Mate is unmapped, insertion type. Only add if soft-clip is available
+                                      template_edges, node_to_name,
+                                      pos2, mapq_thresh, clip_scope, ReadEnum_t.BREAKEND, bad_clip_counter,
+                                      mm_only, site_adder, 0, trust_ins_len)
+            else:
+                # Use whole read, could be normal or discordant
+                if not paired_end:
+                    if max(left_clip_size, right_clip_size) > 250:
+                        read_enum = ReadEnum_t.BREAKEND
+                        if left_clip_size > right_clip_size:
+                            event_pos = r.pos  # else reference_end is used
                         process_alignment(G, r, clip_l, max_dist, gettid,
                                           overlap_regions, clustering_dist, pe_scope,
                                           cigar_index, event_pos, paired_end, tell, genome_scanner,
                                           template_edges, node_to_name,
-                                          pos2, mapq_thresh, clip_scope, ReadEnum_t.BREAKEND, bad_clip_counter,
+                                          pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter,
                                           mm_only, site_adder, 0, trust_ins_len)
-                else:
-                    # Use whole read, could be normal or discordant
-                    if not paired_end:
-                        if max(left_clip_size, right_clip_size) > 250:
-                            read_enum = ReadEnum_t.BREAKEND
-                            if left_clip_size > right_clip_size:
-                                event_pos = r.pos  # else reference_end is used
-                            process_alignment(G, r, clip_l, max_dist, gettid,
-                                              overlap_regions, clustering_dist, pe_scope,
-                                              cigar_index, event_pos, paired_end, tell, genome_scanner,
-                                              template_edges, node_to_name,
-                                              pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter,
-                                              mm_only, site_adder, 0, trust_ins_len)
 
+                else:
+                    if r.flag & 2 and abs(r.tlen) < max_dist and r.rname == r.rnext:
+                        if not clipped:
+                            continue
+                        read_enum = ReadEnum_t.BREAKEND
                     else:
-                        if r.flag & 2 and abs(r.tlen) < max_dist and r.rname == r.rnext:
-                            if not clipped:
-                                continue
-                            read_enum = ReadEnum_t.BREAKEND
-                        else:
-                            read_enum = ReadEnum_t.DISCORDANT
+                        read_enum = ReadEnum_t.DISCORDANT
 
-                        if left_clip_size or right_clip_size:
-                            if left_clip_size > right_clip_size:
-                                event_pos = r.pos  # else reference_end is used
-                        process_alignment(G, r, clip_l, max_dist, gettid,
-                                          overlap_regions, clustering_dist, pe_scope,
-                                          cigar_index, event_pos, paired_end, tell, genome_scanner,
-                                          template_edges, node_to_name,
-                                          pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter,
-                                          mm_only, site_adder, 0, trust_ins_len)
-            # process within-read events
-            if not events_to_add.empty():
-                itr_events = events_to_add.begin()
-                while itr_events != events_to_add.end():
-                    v = dereference(itr_events)
-                    if v.cigar_skip:
-                        pos2 = v.pos2
-                    else:
-                        pos2 = v.event_pos + v.length  # fall back on original cigar event length
+                    if left_clip_size or right_clip_size:
+                        if left_clip_size > right_clip_size:
+                            event_pos = r.pos  # else reference_end is used
                     process_alignment(G, r, clip_l, max_dist, gettid,
                                       overlap_regions, clustering_dist, pe_scope,
-                                      v.cigar_index, v.event_pos, paired_end, tell, genome_scanner,
+                                      cigar_index, event_pos, paired_end, tell, genome_scanner,
                                       template_edges, node_to_name,
-                                      v.pos2, mapq_thresh, clip_scope, v.read_enum, bad_clip_counter,
-                                      mm_only, site_adder, v.length, trust_ins_len)
-                    preincrement(itr_events)
+                                      pos2, mapq_thresh, clip_scope, read_enum, bad_clip_counter,
+                                      mm_only, site_adder, 0, trust_ins_len)
+        # process within-read events
+        if not events_to_add.empty():
+            itr_events = events_to_add.begin()
+            while itr_events != events_to_add.end():
+                v = dereference(itr_events)
+                if v.cigar_skip:
+                    pos2 = v.pos2
+                else:
+                    pos2 = v.event_pos + v.length  # fall back on original cigar event length
+                process_alignment(G, r, clip_l, max_dist, gettid,
+                                  overlap_regions, clustering_dist, pe_scope,
+                                  v.cigar_index, v.event_pos, paired_end, tell, genome_scanner,
+                                  template_edges, node_to_name,
+                                  v.pos2, mapq_thresh, clip_scope, v.read_enum, bad_clip_counter,
+                                  mm_only, site_adder, v.length, trust_ins_len)
+                preincrement(itr_events)
 
     add_template_edges(G, template_edges)
     if site_adder:
diff --git a/dysgu/post_call.py b/dysgu/post_call.py
index 945c7f8..4ae45cb 100644
--- a/dysgu/post_call.py
+++ b/dysgu/post_call.py
@@ -2,7 +2,7 @@
 import numpy as np
 from dysgu.map_set_utils import echo
 from dysgu import re_map
-from dysgu.io_funcs import reverse_complement, intersecter #, iitree
+from dysgu.io_funcs import reverse_complement, intersecter
 import zlib
 import math
 import pickle
@@ -17,8 +17,11 @@
 from collections import defaultdict, Counter
 import os
 import warnings
-from sklearn.exceptions import InconsistentVersionWarning
-warnings.filterwarnings(action='ignore', category=InconsistentVersionWarning)
+try:
+    from sklearn.exceptions import InconsistentVersionWarning
+    warnings.filterwarnings(action='ignore', category=InconsistentVersionWarning)
+except ImportError:
+    pass
 
 
 def get_badclip_metric(events, bad_clip_counter, bam, regions):
diff --git a/pyproject.toml b/pyproject.toml
index 91c0451..13eaf63 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dysgu"
-version = "1.8.0"
+version = "1.8.1"
 description = "Structural variant calling"
 authors = [
     { name = "Kez Cleal", email = "clealk@cardiff.ac.uk" }