Master <- Dysgu dev #124

Merged: 8 commits, Feb 26, 2025
2 changes: 0 additions & 2 deletions .github/workflows/main.yml
@@ -11,8 +11,6 @@ jobs:
strategy:
matrix:
os: [ubuntu-20.04, macOS-14]
#os: [ubuntu-latest, macOS-11]

steps:
- uses: actions/checkout@v4
- name: Set correct paths for Homebrew on macOS
30 changes: 22 additions & 8 deletions README.rst
@@ -42,7 +42,7 @@ for calling structural variants using paired-end or long read sequencing data. S
⚙️ Installation
---------------

Dysgu can be installed via pip using Python 3.10 - 3.12 and has been tested on Linux and macOS.
Dysgu can be installed via pip using Python >=3.10 and has been tested on Linux and macOS.
The list of python packages needed can be found in requirements.txt.
To install::

@@ -79,21 +79,25 @@ For help use::
dysgu --help
dysgu command --help

To use the python-API see the `documentation <https://kcleal.github.io/dysgu/API.html>`_, or `jupyter notebook <https://github.com/kcleal/dysgu/blob/master/dysgu_api_demo.ipynb>`_,
To use the python-API see the `documentation <https://kcleal.github.io/dysgu/API.html>`_,
or `jupyter notebook <https://github.com/kcleal/dysgu/blob/master/dysgu_api_demo.ipynb>`_,


🎯 Calling SVs
--------------

Paired-end reads
****************
Dysgu was developed to work with paired-end reads aligned using bwa mem, although other aligners will work. To call SVs, a sorted and indexed .bam/cram is needed plus an indexed reference genome (fasta format). Also a working directory must
be provided to store temporary files. There are a few ways to run dysgu depending on the type of data you have.
Dysgu was developed to work with paired-end reads aligned using bwa mem, although other aligners will work.
To call SVs, a sorted and indexed .bam/cram is needed plus an indexed reference genome (fasta format).
Also a working directory must be provided to store temporary files.
There are a few ways to run dysgu depending on the type of data you have.
For paired-end data the `run` command is recommended which wraps `fetch` and `call`::

dysgu run reference.fa temp_dir input.bam > svs.vcf

This will first run the `fetch` command that will create a temporary bam file plus other analysis files in the directory `temp_dir`. These temporary files are then analysed using the `call` program.
This will first run the `fetch` command that will create a temporary bam file plus other analysis files
in the directory `temp_dir`. These temporary files are then analysed using the `call` program.
To make use of multiprocessing, set the "-p" parameter::

dysgu run -p4 reference.fa temp_dir input.bam > svs.vcf
@@ -104,14 +108,24 @@ Dysgu also accepts reads from stdin. In this example, the --clean flag will remo

Long reads
**********
Dysgu is designed to work with long reads aligned using minimap2 or ngmlr. Use the 'call' pipeline if starting with a bam file, or 'run' if starting with a cram::
Dysgu is designed to work with long reads aligned using minimap2 or ngmlr. Use the 'call' pipeline if
starting with a bam file, or 'run' if starting with a cram::

dysgu call --mode pacbio-revio reference.fa temp_dir input.bam > svs.vcf

dysgu call --mode nanopore-r10 reference.fa temp_dir input.bam > svs.vcf

Presets are available using the `--mode` option for PacBio `pacbio-sequel2 | pacbio-revio`, and ONT `nanopore-r9 | nanopore-r10`
If you are using reads with higher error rates, or are unsure of the read-accuracy, it is recommended to set '--divergence auto', to infer a more conservative sequence divergence. The default is set at 0.02 which can be too stringent for lower accuracy reads and will result in more reads being filtered and lower sensitivity::
Presets are available using the `--mode` option for PacBio `pacbio-sequel2 | pacbio-revio`,
and ONT `nanopore-r9 | nanopore-r10`.

Since v1.8, higher calling accuracy can be achieved by adding "haplotags" to your input bam/cram file
(using hiphase / whatshap / longshot etc). Dysgu reads HP and PS tags automatically and produces
phased output calls.

If you are using reads with higher error rates, or are unsure of the read accuracy,
it is recommended to set '--divergence auto' to infer a more conservative sequence divergence.
The default is set at 0.02 which can be too stringent for lower accuracy reads and will result in
more reads being filtered and lower sensitivity::

dysgu call --divergence auto --mode nanopore reference.fa temp_dir input.bam > svs.vcf
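The effect of the divergence cutoff can be illustrated with a short sketch. This is not dysgu's internal code: the function name and the use of edit distance over aligned bases are simplifying assumptions, but they capture why a fixed 0.02 threshold filters out noisier long reads.

```python
def passes_divergence_filter(edit_distance, aligned_bases, max_divergence=0.02):
    """Keep a read when its per-base divergence (edits over aligned bases)
    stays within the allowed threshold; '--divergence auto' estimates a
    more permissive threshold from the data instead of using 0.02."""
    if aligned_bases <= 0:
        return False
    return (edit_distance / aligned_bases) <= max_divergence

# A high-accuracy read: 100 edits over 10 kb -> divergence 0.01, kept
print(passes_divergence_filter(100, 10_000))   # True
# A noisier read: 500 edits over 10 kb -> divergence 0.05, filtered at the default
print(passes_divergence_filter(500, 10_000))   # False
```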

82 changes: 50 additions & 32 deletions dysgu/call_component.pyx
@@ -105,7 +105,7 @@ cdef base_quals_aligned_clipped(AlignedSegment a):
cdef float clipped_base_quals = 0
cdef int left_clip = 0
cdef int right_clip = 0
clip_sizes(a, left_clip, right_clip)
clip_sizes(a, &left_clip, &right_clip)
clipped_bases = left_clip + right_clip
cdef const unsigned char[:] quals = a.query_qualities
cdef int i
@@ -119,6 +119,32 @@ cdef base_quals_aligned_clipped(AlignedSegment a):
return aligned_base_quals, aligned_bases, clipped_base_quals, clipped_bases


cdef collect_phase_tags(bint get_hp_tag, AlignedSegment a, EventResult_t er):
if not get_hp_tag:
return
if not a.has_tag("HP"):
if er.haplotype is None or 'u' not in er.haplotype:
er.haplotype = {'u': 1}
else:
er.haplotype['u'] += 1
return
hp_tag = a.get_tag("HP")
if not er.haplotype:
er.haplotype = {hp_tag: 1}
elif hp_tag not in er.haplotype:
er.haplotype[hp_tag] = 1
else:
er.haplotype[hp_tag] += 1
if a.has_tag("PS"):
pg_tag = a.get_tag("PS")
if not er.phase_set:
er.phase_set = {pg_tag: 1}
elif pg_tag not in er.phase_set:
er.phase_set[pg_tag] = 1
else:
er.phase_set[pg_tag] += 1
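The tallying performed by the new collect_phase_tags helper can be sketched in plain Python. FakeRead is a hypothetical stand-in for pysam's AlignedSegment tag interface, and plain Counters replace the EventResult haplotype/phase_set fields:

```python
from collections import Counter

class FakeRead:
    """Minimal stand-in for pysam.AlignedSegment tag access."""
    def __init__(self, tags):
        self._tags = tags
    def has_tag(self, name):
        return name in self._tags
    def get_tag(self, name):
        return self._tags[name]

def tally_phase_tags(reads):
    """Count HP (haplotype) and PS (phase-set) tags across supporting reads.
    Reads without an HP tag are counted under the 'u' (untagged) key,
    mirroring collect_phase_tags above."""
    haplotype, phase_set = Counter(), Counter()
    for r in reads:
        if not r.has_tag("HP"):
            haplotype["u"] += 1
            continue
        haplotype[r.get_tag("HP")] += 1
        if r.has_tag("PS"):
            phase_set[r.get_tag("PS")] += 1
    return haplotype, phase_set

reads = [FakeRead({"HP": 1, "PS": 7}), FakeRead({"HP": 1, "PS": 7}), FakeRead({})]
hp, ps = tally_phase_tags(reads)
print(dict(hp))  # {1: 2, 'u': 1}
print(dict(ps))  # {7: 2}
```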


cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
EventResult_t er, bint paired_end_reads, bint get_hp_tag):
cdef float NMpri = 0
@@ -145,6 +171,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
cdef float a_bases, large_gaps, n_small_gaps
cdef AlignedSegment a
for index, a in enumerate(itertools.chain(reads1, reads2, [i.read_a for i in generic_ins])):
seen.add(a.qname)
flag = a.flag
if flag & 2:
er.NP += 1
@@ -154,24 +181,19 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
er.n_unmapped_mates += 1
left_clip = 0
right_clip = 0
clip_sizes_hard(a, left_clip, right_clip)
clip_sizes_hard(a, &left_clip, &right_clip)
if left_clip > 0 and right_clip > 0:
er.double_clips += 1
has_sa = a.has_tag("SA")
if has_sa:
n_sa += a.get_tag("SA").count(";")
if a.has_tag("XA"):
n_xa += a.get_tag("XA").count(";")
if get_hp_tag and a.has_tag("HP"):
hp_tag = a.get_tag("HP")
if not er.haplotypes:
er.haplotypes = {hp_tag: 1}
elif hp_tag not in er.haplotypes:
er.haplotypes[hp_tag] = 1
else:
er.haplotypes[hp_tag] += 1

collect_phase_tags(get_hp_tag, a, er)

a_bases, large_gaps, n_small_gaps = n_aligned_bases(a)

if a_bases > 0:
n_gaps += n_small_gaps / a_bases
if flag & 2304: # Supplementary (and not primary if -M if flagged using bwa)
@@ -190,7 +212,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
total_pri += 1
MAPQpri += a.mapq
if paired_end_reads:
if index >= len(reads2) and a.qname in paired_end:
if index >= len(reads1) and a.qname in paired_end:
er.pe += 1
else:
paired_end.add(a.qname)
@@ -206,7 +228,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,

left_clip = 0
right_clip = 0
clip_sizes(a, left_clip, right_clip)
clip_sizes(a, &left_clip, &right_clip)
if left_clip or right_clip:
er.sc += 1
if a.flag & 1: # paired read
@@ -217,28 +239,23 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
clipped_bases += cb

for a in spanning:
seen.add(a.qname)
flag = a.flag
if flag & 2:
er.NP += 1
left_clip = 0
right_clip = 0
clip_sizes_hard(a, left_clip, right_clip)
clip_sizes_hard(a, &left_clip, &right_clip)
if left_clip > 0 and right_clip > 0:
er.double_clips += 1
if a.has_tag("SA"):
n_sa += a.get_tag("SA").count(";")
if a.has_tag("XA"):
n_xa += a.get_tag("XA").count(";")
if get_hp_tag and a.has_tag("HP"):
hp_tag = a.get_tag("HP")
if not er.haplotypes:
er.haplotypes = {hp_tag: 1}
elif hp_tag not in er.haplotypes:
er.haplotypes[hp_tag] = 1
else:
er.haplotypes[hp_tag] += 1

collect_phase_tags(get_hp_tag, a, er)
a_bases, large_gaps, n_small_gaps = n_aligned_bases(a)

if a_bases > 0:
n_gaps += n_small_gaps / a_bases
if flag & 2304: # Supplementary
@@ -257,7 +274,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
total_pri += 1
if paired_end_reads:
if a.qname in paired_end: # If two primary reads from same pair
er.pe += 2
er.pe += 1
else:
paired_end.add(a.qname)
if a.has_tag("NM"):
@@ -296,6 +313,8 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
else:
er.clip_qual_ratio = 0

er.a_freq += len(seen)


cdef int within_read_end_position(int event_pos, CigarItem cigar_item):
cdef int end
@@ -507,7 +526,7 @@ cdef make_generic_insertion_item(aln, int insert_size, int insert_std):
v_item.svtype = "BND"
left_clip = 0
right_clip = 0
clip_sizes(aln, left_clip, right_clip)
clip_sizes(aln, &left_clip, &right_clip)
clip_s = max(left_clip, right_clip)
rand_insert_pos = 100 if not clip_s else clip_s
v_item.inferred_sv_len = 0 if rand_insert_pos < 0 else rand_insert_pos
@@ -918,7 +937,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag):
hp_count += 1
lengths = [s.cigar_item.len for s in hp_spanning]
cluster_lengths(clusters, lengths, hp_spanning, eps)
# Merge near-identical haplotypes
# Merge near-identical haplotype
if len(clusters) > 1:
cl_srt = sorted([ [ np.mean([c.cigar_item.len for c in clst]), clst ] for clst in clusters], key=lambda x: x[0])
merged_haps = [cl_srt[0]]
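The merge step above can be sketched in plain Python, under the simplifying assumption that clusters are bare lists of SV lengths and that adjacent clusters merge when their mean lengths differ by less than eps (the exact criterion lives in linear_scan_clustering and is partly collapsed in this diff):

```python
from statistics import mean

def merge_similar_length_clusters(clusters, eps):
    """Clusters are lists of SV lengths. Sort clusters by mean length, then
    fold any cluster whose mean is within eps of the previous one into it."""
    cl_srt = sorted(clusters, key=mean)
    merged = [list(cl_srt[0])]
    for clst in cl_srt[1:]:
        if abs(mean(clst) - mean(merged[-1])) < eps:
            merged[-1].extend(clst)  # near-identical haplotypes: merge
        else:
            merged.append(list(clst))
    return merged

clusters = [[100, 102, 101], [103, 104], [250, 255]]
print(merge_similar_length_clusters(clusters, eps=10))
# [[100, 102, 101, 103, 104], [250, 255]]
```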
@@ -945,6 +964,7 @@

def process_spanning(bint paired_end, spanning_alignments, float divergence, length_extend, informative,
generic_insertions, float insert_ppf, bint to_assemble, bint hp_tag):

# echo("PROCESS SPANNING")
cdef int min_found_support = 0
cdef str svtype, jointype
@@ -996,6 +1016,7 @@ def process_spanning(bint paired_end, spanning_alignments, float divergence, len

# 1.7.0
# svlen = int(np.median([sp.cigar_item.len for sp in spanning_alignments]))

posA = spanning_alignments[best_index].pos
posB = spanning_alignments[best_index].end
er.preciseA = True
@@ -1457,8 +1478,8 @@ cdef tuple mask_soft_clips(AlignedSegment a, AlignedSegment b):
cdef int left_clipB = 0
cdef int right_clipB = 0

clip_sizes_hard(a, left_clipA, right_clipA)
clip_sizes_hard(b, left_clipB, right_clipB)
clip_sizes_hard(a, &left_clipA, &right_clipA)
clip_sizes_hard(b, &left_clipB, &right_clipB)

cdef int a_template_start = 0
cdef int b_template_start = 0
@@ -1641,7 +1662,8 @@ cdef call_from_reads(u_reads_info, v_reads_info, int insert_size, int insert_std

cdef AlignedSegment a, b
cdef uint32_t cigar_l_a, cigar_l_b
cdef uint32_t *cigar_p_a, *cigar_p_b
cdef uint32_t *cigar_p_a
cdef uint32_t *cigar_p_b


for qname in [k for k in grp_u if k in grp_v]: # Qname found on both sides
@@ -2020,8 +2042,7 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
for (u, v), d in data.s_between: #.items():
rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info) # [(Nodeinfo, alignment)..]
rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info)
# echo("rd_u", [rr[1].qname for rr in rd_u])
# echo("rd_v", [rr[1].qname for rr in rd_v])

total_reads = len(rd_u) + len(rd_v)
buffered_reads += total_reads
if add_to_buffer and buffered_reads > 50000:
@@ -2106,9 +2127,6 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std
n_parts = len(data.parts) if data.parts else 0
events = []
info = data.info
# echo(data.parts)
# echo(data.s_between)
# echo(data.s_within)
if data.reads is None:
data.reads = {}
# next deal with info - need to filter these into the partitions, then deal with them in single / one_edge
5 changes: 3 additions & 2 deletions dysgu/cluster.pyx
@@ -249,6 +249,9 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
sites_index = sites_adder.sites_index
logging.info("Graph constructed")

if not args['no_phase'] and hp_tag_found:
logging.info("Using HP tag")

auto_support = False
if args["min_support"] != "auto":
args["min_support"] = int(args["min_support"])
@@ -508,8 +511,6 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N

preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"])

# preliminaries = post_call.filter_microsatellite_non_diploid(preliminaries)

preliminaries = extra_metrics.sample_level_density(preliminaries, regions)
preliminaries = coverage_analyser.normalize_coverage_values(preliminaries)

3 changes: 3 additions & 0 deletions dysgu/coverage.pyx
@@ -621,5 +621,8 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov
if r.chrA != r.chrB:
r.svlen = 1000000
r.mcov = max_depth

r.a_freq = r.a_freq / reads_10kb
r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2)
new_events.append(r)
return new_events
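The a_freq normalisation added above amounts to dividing the count of distinct supporting reads by the local depth, clamping into [0, 1], and rounding to two decimal places. A plain-Python sketch (the function and variable names are illustrative, not dysgu's API):

```python
def estimate_allele_frequency(n_supporting_reads, reads_10kb):
    """Approximate variant allele frequency: distinct supporting read names
    divided by local coverage, clamped to [0, 1] and rounded to 2 dp."""
    if reads_10kb <= 0:
        return 0.0
    a_freq = n_supporting_reads / reads_10kb
    return round(max(0.0, min(a_freq, 1.0)), 2)

print(estimate_allele_frequency(13, 50))  # 0.26
print(estimate_allele_frequency(60, 40))  # 1.0 (ratio 1.5 clamped to 1.0)
```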
2 changes: 1 addition & 1 deletion dysgu/extra_metrics.pyx
@@ -95,7 +95,7 @@ cdef float soft_clip_qual_corr(reads):
quals = r.query_qualities
left_clip = 0
right_clip = 0
clip_sizes(r, left_clip, right_clip)
clip_sizes(r, &left_clip, &right_clip)

if quals is None:
continue
4 changes: 2 additions & 2 deletions dysgu/graph.pyx
@@ -252,7 +252,7 @@ cdef class ClipScoper:
unordered_set[int]& clustered_nodes):
cdef int clip_left = 0
cdef int clip_right = 0
clip_sizes(r, clip_left, clip_right)
clip_sizes(r, &clip_left, &clip_right)
if chrom != self.current_chrom:
self.scope_left.clear()
self.scope_right.clear()
@@ -1368,7 +1368,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
pos2 = -1
left_clip_size = 0
right_clip_size = 0
clip_sizes_hard(r, left_clip_size, right_clip_size) # soft and hard-clips
clip_sizes_hard(r, &left_clip_size, &right_clip_size) # soft and hard-clips
if r.flag & 8 and clipped: # paired by inference
# skip if both ends are clipped, usually means its a chunk of badly mapped sequence
# if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ):