From dc0ff467ff128abc40f6e1b3090ac58f4c2d96d9 Mon Sep 17 00:00:00 2001 From: Kez Cleal <42997789+kcleal@users.noreply.github.com> Date: Fri, 31 Jan 2025 10:59:43 +0000 Subject: [PATCH 1/8] Dysgu_dev <-- master (#121) * Dysgu dev (#115) * dysgu_dev <- master (#108) * correct usage of update_filter_value in filter_normals It seems like at some point the usage of update_filter_value changed, because in several spots, it is missing the sample_name argument. This breaks `dysgu filter --keep-all`. * Update main.yml * Update main.yml * Update requirements.txt * updated build process to pyproject.toml (#103) * v1.6.6 updated README * Update main.yml * Added pyproject.toml * Updated pyproject.toml and setup.py * Update pyproject.toml --------- Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com> * Better recall+precision for long-reads. Improved merging pipeline. Faster runtime. Code refactoring. WIP # [skip ci] * Fixed issues with overriding presets. --mode option updated # [skip ci] * Fixed issues with overriding presets. --mode option updated to support r10 and revio, new presets available. --no-gt flat removed # [skip ci] * Fixed CLI issues for tests # [skip ci] * Fixed API issue * v1.7.0 * added deprecated warning for mode * Added filter for high divergence reads (long-reads only) # [skip ci] * Better read merging for long reads. New clustering methods for long reads. Improved runtime. * Update main.yml * Update main.yml * Update osx-deps * Update main.yml macos11 * Update main.yml * Update main.yml * Update main.yml * Update osx-deps * Update osx-deps * Update osx-deps * Update osx-deps * Update main.yml * Update osx-deps * Update main.yml * Updated workflow * Updated workflow * Updated workflow * Update main.yml * Updated workflow --------- Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com> * Fixes compile error for clang * Fixed merge None type error. 
Added wb3 as default compression level for long-reads (run-pipeline) (#116) * Updated workflow * Updated workflow * Fixed compiler warning --------- Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com> --- .github/workflows/main.yml | 2 -- dysgu/call_component.pyx | 18 ++++++++---------- dysgu/graph.pyx | 4 ++-- dysgu/map_set_utils.pxd | 4 ++-- dysgu/map_set_utils.pyx | 12 ++++++------ setup.py | 1 - 6 files changed, 18 insertions(+), 23 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 33b9b6f..bf6a5fa 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,8 +11,6 @@ jobs: strategy: matrix: os: [ubuntu-20.04, macOS-14] - #os: [ubuntu-latest, macOS-11] - steps: - uses: actions/checkout@v4 - name: Set correct paths for Homebrew on macOS diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 94a608d..71b3a84 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -105,7 +105,7 @@ cdef base_quals_aligned_clipped(AlignedSegment a): cdef float clipped_base_quals = 0 cdef int left_clip = 0 cdef int right_clip = 0 - clip_sizes(a, left_clip, right_clip) + clip_sizes(a, &left_clip, &right_clip) clipped_bases = left_clip + right_clip cdef const unsigned char[:] quals = a.query_qualities cdef int i @@ -154,7 +154,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, er.n_unmapped_mates += 1 left_clip = 0 right_clip = 0 - clip_sizes_hard(a, left_clip, right_clip) + clip_sizes_hard(a, &left_clip, &right_clip) if left_clip > 0 and right_clip > 0: er.double_clips += 1 has_sa = a.has_tag("SA") @@ -206,7 +206,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, left_clip = 0 right_clip = 0 - clip_sizes(a, left_clip, right_clip) + clip_sizes(a, &left_clip, &right_clip) if left_clip or right_clip: er.sc += 1 if a.flag & 1: # paired read @@ -222,7 +222,7 @@ cdef count_attributes2(reads1, reads2, spanning, 
float insert_ppf, generic_ins, er.NP += 1 left_clip = 0 right_clip = 0 - clip_sizes_hard(a, left_clip, right_clip) + clip_sizes_hard(a, &left_clip, &right_clip) if left_clip > 0 and right_clip > 0: er.double_clips += 1 if a.has_tag("SA"): @@ -507,7 +507,7 @@ cdef make_generic_insertion_item(aln, int insert_size, int insert_std): v_item.svtype = "BND" left_clip = 0 right_clip = 0 - clip_sizes(aln, left_clip, right_clip) + clip_sizes(aln, &left_clip, &right_clip) clip_s = max(left_clip, right_clip) rand_insert_pos = 100 if not clip_s else clip_s v_item.inferred_sv_len = 0 if rand_insert_pos < 0 else rand_insert_pos @@ -945,6 +945,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag): def process_spanning(bint paired_end, spanning_alignments, float divergence, length_extend, informative, generic_insertions, float insert_ppf, bint to_assemble, bint hp_tag): + # echo("PROCESS SPANNING") cdef int min_found_support = 0 cdef str svtype, jointype @@ -996,6 +997,7 @@ def process_spanning(bint paired_end, spanning_alignments, float divergence, len # 1.7.0 # svlen = int(np.median([sp.cigar_item.len for sp in spanning_alignments])) + posA = spanning_alignments[best_index].pos posB = spanning_alignments[best_index].end er.preciseA = True @@ -2020,8 +2022,7 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, for (u, v), d in data.s_between: #.items(): rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info) # [(Nodeinfo, alignment)..] 
rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info) - # echo("rd_u", [rr[1].qname for rr in rd_u]) - # echo("rd_v", [rr[1].qname for rr in rd_v]) + total_reads = len(rd_u) + len(rd_v) buffered_reads += total_reads if add_to_buffer and buffered_reads > 50000: @@ -2106,9 +2107,6 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std n_parts = len(data.parts) if data.parts else 0 events = [] info = data.info - # echo(data.parts) - # echo(data.s_between) - # echo(data.s_within) if data.reads is None: data.reads = {} # next deal with info - need to filter these into the partitions, then deal with them in single / one_edge diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index 78a0e6e..cd0c984 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -252,7 +252,7 @@ cdef class ClipScoper: unordered_set[int]& clustered_nodes): cdef int clip_left = 0 cdef int clip_right = 0 - clip_sizes(r, clip_left, clip_right) + clip_sizes(r, &clip_left, &clip_right) if chrom != self.current_chrom: self.scope_left.clear() self.scope_right.clear() @@ -1368,7 +1368,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering pos2 = -1 left_clip_size = 0 right_clip_size = 0 - clip_sizes_hard(r, left_clip_size, right_clip_size) # soft and hard-clips + clip_sizes_hard(r, &left_clip_size, &right_clip_size) # soft and hard-clips if r.flag & 8 and clipped: # paired by inference # skip if both ends are clipped, usually means its a chunk of badly mapped sequence # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ): diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd index 2dcb72b..866a729 100644 --- a/dysgu/map_set_utils.pxd +++ b/dysgu/map_set_utils.pxd @@ -266,9 +266,9 @@ cdef extern from "" namespace "std" nogil: cdef int cigar_exists(AlignedSegment r) -cdef void clip_sizes(AlignedSegment r, int& left, int& right) +cdef void clip_sizes(AlignedSegment r, 
int* left, int* right) -cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right) +cdef void clip_sizes_hard(AlignedSegment r, int* left, int* right) cdef int cigar_clip(AlignedSegment r, int clip_length) diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index 4b69999..e6ab2e0 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -223,7 +223,7 @@ cdef int cigar_exists(AlignedSegment r): return 0 -cdef void clip_sizes(AlignedSegment r, int& left, int& right): +cdef void clip_sizes(AlignedSegment r, int* left, int* right): cdef uint32_t cigar_value cdef uint32_t cigar_l cdef uint32_t *cigar_p @@ -235,14 +235,14 @@ cdef void clip_sizes(AlignedSegment r, int& left, int& right): cigar_value = cigar_p[0] opp = cigar_value & 15 if opp == 4: - left = cigar_value >> 4 + left[0] = cigar_value >> 4 cigar_value = cigar_p[cigar_l - 1] opp = cigar_value & 15 if opp == 4: - right = cigar_value >> 4 + right[0] = cigar_value >> 4 -cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right): +cdef void clip_sizes_hard(AlignedSegment r, int* left, int* right): cdef uint32_t cigar_value cdef uint32_t cigar_l cdef uint32_t *cigar_p @@ -255,11 +255,11 @@ cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right): cigar_value = cigar_p[0] opp = cigar_value & 15 if opp == 4 or opp == 5: - left = cigar_value >> 4 + left[0] = cigar_value >> 4 cigar_value = cigar_p[cigar_l - 1] opp = cigar_value & 15 if opp == 4 or opp == 5: - right = cigar_value >> 4 + right[0] = cigar_value >> 4 cdef int cigar_clip(AlignedSegment r, int clip_length): diff --git a/setup.py b/setup.py index ee20dde..4fdfa27 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,6 @@ def get_extra_args(): extras = get_extra_args() + ["-Wno-sign-compare", "-Wno-unused-function", "-Wno-unused-result", '-Wno-ignored-qualifiers', "-Wno-deprecated-declarations", "-fpermissive", - "-Wno-unreachable-code-fallthrough", ] def get_extension_modules(): From 
5702658354c2554b6017be493561592799336ccc Mon Sep 17 00:00:00 2001 From: kcleal Date: Fri, 31 Jan 2025 11:33:22 +0000 Subject: [PATCH 2/8] Fixed compile errors. Add logging for HP tag usage --- dysgu/call_component.pyx | 7 ++++--- dysgu/cluster.pyx | 3 +++ dysgu/extra_metrics.pyx | 2 +- dysgu/map_set_utils.pyx | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 71b3a84..39a850c 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -1459,8 +1459,8 @@ cdef tuple mask_soft_clips(AlignedSegment a, AlignedSegment b): cdef int left_clipB = 0 cdef int right_clipB = 0 - clip_sizes_hard(a, left_clipA, right_clipA) - clip_sizes_hard(b, left_clipB, right_clipB) + clip_sizes_hard(a, &left_clipA, &right_clipA) + clip_sizes_hard(b, &left_clipB, &right_clipB) cdef int a_template_start = 0 cdef int b_template_start = 0 @@ -1643,7 +1643,8 @@ cdef call_from_reads(u_reads_info, v_reads_info, int insert_size, int insert_std cdef AlignedSegment a, b cdef uint32_t cigar_l_a, cigar_l_b - cdef uint32_t *cigar_p_a, *cigar_p_b + cdef uint32_t *cigar_p_a + cdef uint32_t *cigar_p_b for qname in [k for k in grp_u if k in grp_v]: # Qname found on both sides diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index ad3aea4..1b21d90 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -249,6 +249,9 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N sites_index = sites_adder.sites_index logging.info("Graph constructed") + if not args['no_phase'] and hp_tag_found: + logging.info("Using HP tag") + auto_support = False if args["min_support"] != "auto": args["min_support"] = int(args["min_support"]) diff --git a/dysgu/extra_metrics.pyx b/dysgu/extra_metrics.pyx index 80d7160..aabf72e 100644 --- a/dysgu/extra_metrics.pyx +++ b/dysgu/extra_metrics.pyx @@ -95,7 +95,7 @@ cdef float soft_clip_qual_corr(reads): quals = r.query_qualities left_clip = 0 right_clip = 0 - clip_sizes(r, 
left_clip, right_clip) + clip_sizes(r, &left_clip, &right_clip) if quals is None: continue diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index e6ab2e0..c882661 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -269,7 +269,7 @@ cdef int cigar_clip(AlignedSegment r, int clip_length): return 0 cdef int left = 0 cdef int right = 0 - clip_sizes(r, left, right) + clip_sizes(r, &left, &right) if left >= clip_length or right >= clip_length: return 1 return 0 From 634102f3bb75310cdc63a62714aa8f97e3161413 Mon Sep 17 00:00:00 2001 From: kcleal Date: Fri, 31 Jan 2025 11:50:35 +0000 Subject: [PATCH 3/8] Added superintervals dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 390e8d1..91c0451 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "networkx >= 2.4", "scikit-learn >= 0.22", "sortedcontainers", + "superintervals", "lightgbm" ] From c4e1484a985aa50d246e8037b23c3d7faf2ff4c8 Mon Sep 17 00:00:00 2001 From: kcleal Date: Mon, 24 Feb 2025 13:05:56 +0000 Subject: [PATCH 4/8] Added PSET phaseset HP haplotype tag and AF allele frequency tags to output --- dysgu/call_component.pyx | 57 +++++--- dysgu/cluster.pyx | 2 - dysgu/coverage.pyx | 3 + dysgu/io_funcs.pyx | 47 +++---- dysgu/map_set_utils.pxd | 3 +- dysgu/map_set_utils.pyx | 10 +- dysgu/post_call.py | 273 ++++++++++++++++++++++----------------- dysgu/view.py | 3 + 8 files changed, 223 insertions(+), 175 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 39a850c..0d5ebcf 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -119,6 +119,32 @@ cdef base_quals_aligned_clipped(AlignedSegment a): return aligned_base_quals, aligned_bases, clipped_base_quals, clipped_bases +cdef collect_phase_tags(bint get_hp_tag, AlignedSegment a, EventResult_t er): + if not get_hp_tag: + return + if not a.has_tag("HP"): + if er.haplotype is None or 'u' not in 
er.haplotype: + er.haplotype = {'u': 1} + else: + er.haplotype['u'] += 1 + return + hp_tag = a.get_tag("HP") + if not er.haplotype: + er.haplotype = {hp_tag: 1} + elif hp_tag not in er.haplotype: + er.haplotype[hp_tag] = 1 + else: + er.haplotype[hp_tag] += 1 + if a.has_tag("PS"): + pg_tag = a.get_tag("PS") + if not er.phase_set: + er.phase_set = {pg_tag: 1} + elif pg_tag not in er.phase_set: + er.phase_set[pg_tag] = 1 + else: + er.phase_set[pg_tag] += 1 + + cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, EventResult_t er, bint paired_end_reads, bint get_hp_tag): cdef float NMpri = 0 @@ -145,6 +171,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, cdef float a_bases, large_gaps, n_small_gaps cdef AlignedSegment a for index, a in enumerate(itertools.chain(reads1, reads2, [i.read_a for i in generic_ins])): + seen.add(a.qname) flag = a.flag if flag & 2: er.NP += 1 @@ -162,16 +189,11 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, n_sa += a.get_tag("SA").count(";") if a.has_tag("XA"): n_xa += a.get_tag("XA").count(";") - if get_hp_tag and a.has_tag("HP"): - hp_tag = a.get_tag("HP") - if not er.haplotypes: - er.haplotypes = {hp_tag: 1} - elif hp_tag not in er.haplotypes: - er.haplotypes[hp_tag] = 1 - else: - er.haplotypes[hp_tag] += 1 + + collect_phase_tags(get_hp_tag, a, er) a_bases, large_gaps, n_small_gaps = n_aligned_bases(a) + if a_bases > 0: n_gaps += n_small_gaps / a_bases if flag & 2304: # Supplementary (and not primary if -M if flagged using bwa) @@ -190,7 +212,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, total_pri += 1 MAPQpri += a.mapq if paired_end_reads: - if index >= len(reads2) and a.qname in paired_end: + if index >= len(reads1) and a.qname in paired_end: er.pe += 1 else: paired_end.add(a.qname) @@ -217,6 +239,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, clipped_bases += cb 
for a in spanning: + seen.add(a.qname) flag = a.flag if flag & 2: er.NP += 1 @@ -229,16 +252,10 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, n_sa += a.get_tag("SA").count(";") if a.has_tag("XA"): n_xa += a.get_tag("XA").count(";") - if get_hp_tag and a.has_tag("HP"): - hp_tag = a.get_tag("HP") - if not er.haplotypes: - er.haplotypes = {hp_tag: 1} - elif hp_tag not in er.haplotypes: - er.haplotypes[hp_tag] = 1 - else: - er.haplotypes[hp_tag] += 1 + collect_phase_tags(get_hp_tag, a, er) a_bases, large_gaps, n_small_gaps = n_aligned_bases(a) + if a_bases > 0: n_gaps += n_small_gaps / a_bases if flag & 2304: # Supplementary @@ -257,7 +274,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, total_pri += 1 if paired_end_reads: if a.qname in paired_end: # If two primary reads from same pair - er.pe += 2 + er.pe += 1 else: paired_end.add(a.qname) if a.has_tag("NM"): @@ -296,6 +313,8 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, else: er.clip_qual_ratio = 0 + er.a_freq += len(seen) + cdef int within_read_end_position(int event_pos, CigarItem cigar_item): cdef int end @@ -918,7 +937,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag): hp_count += 1 lengths = [s.cigar_item.len for s in hp_spanning] cluster_lengths(clusters, lengths, hp_spanning, eps) - # Merge near-identical haplotypes + # Merge near-identical haplotype if len(clusters) > 1: cl_srt = sorted([ [ np.mean([c.cigar_item.len for c in clst]), clst ] for clst in clusters], key=lambda x: x[0]) merged_haps = [cl_srt[0]] diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 1b21d90..6ce56ad 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -511,8 +511,6 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"]) - # preliminaries = 
post_call.filter_microsatellite_non_diploid(preliminaries) - preliminaries = extra_metrics.sample_level_density(preliminaries, regions) preliminaries = coverage_analyser.normalize_coverage_values(preliminaries) diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx index 47e0f37..32ca3e5 100644 --- a/dysgu/coverage.pyx +++ b/dysgu/coverage.pyx @@ -621,5 +621,8 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov if r.chrA != r.chrB: r.svlen = 1000000 r.mcov = max_depth + + r.a_freq = r.a_freq / reads_10kb + r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2) new_events.append(r) return new_events diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx index 4ac0bea..4d52d56 100644 --- a/dysgu/io_funcs.pyx +++ b/dysgu/io_funcs.pyx @@ -138,13 +138,13 @@ cpdef list col_names(small_output): if small_output: return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc", - ["GT", "GQ", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase", + ["GT", "GQ", "phase_set", "haplotype", "a_freq", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase", "raw_reads_10kb", "neigh10kb", "plus", "minus", "remap_score", "remap_ed", "bad_clip_count", "fcc", "inner_cn", "outer_cn", "prob"] ] else: return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc", - ["GT", "GQ", "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP", + ["GT", "GQ", "phase_set", "haplotype", "a_freq", "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP", "maxASsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "sqc", "scw", "clip_qual_ratio", "block_edge", "raw_reads_10kb", "mcov", "linked", "neigh", "neigh10kb", "ref_bases", "plus", @@ -157,11 +157,6 
@@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small rep, repsc, lenprec = 0, 0, 1 mean_prob, max_prob = None, None debug = False - # if abs(r['posA'] - 39084726) < 10: - # echo('found', dict(r)) - # echo(len(format_f) > 1) - # echo([(int(v["su"]), v["posA"], v["event_id"], k) for k, v in df_rows.items()]) - # debug = True if len(format_f) > 1: best = sorted([(int(v["su"]), k) for k, v in df_rows.items()], reverse=True)[0][1] probs = [v["prob"] for k, v in df_rows.items()] @@ -273,9 +268,9 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small info_extras += [f"MeanPROB={round(mean_prob, 3)}", f"MaxPROB={round(max_prob, 3)}"] if small_output: - fmt_keys = "GT:GQ:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" + fmt_keys = "GT:GQ:PSET:HP:AF:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" else: - fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB" + fmt_keys = "GT:GQ:PSET:HP:AF:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB" if "variant_seq" in r and isinstance(r["variant_seq"], str): if r['svtype'] == "INS" or r.variant_seq: @@ -318,13 +313,13 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small def get_fmt(r, small_output): if small_output: - v = [r["GT"], r["GQ"], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'], + v = [r["GT"], r["GQ"], r["phase_set"], r["haplotype"], r['a_freq'], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'], r['raw_reads_10kb'], r['neigh10kb'], r["plus"], r["minus"], r["remap_score"], r["remap_ed"], 
r["bad_clip_count"], round(r["fcc"], 3), round(r["inner_cn"], 3), round(r["outer_cn"], 3), r['prob'] ] return v else: - v = [r["GT"], r["GQ"], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'], + v = [r["GT"], r["GQ"], r["phase_set"], r["haplotype"], r['a_freq'], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'], r['MAPQsupp'], r['NP'], r['maxASsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], round(r['sqc'], 2), round(r['scw'], 1), round(r['clip_qual_ratio'], 3), r['block_edge'], r['raw_reads_10kb'], round(r['mcov'], 2), int(r['linked']), r['neigh'], r['neigh10kb'], r['ref_bases'], r["plus"], r["minus"], round(r["strand_binom_t"], 4), r['n_gaps'], round(r["n_sa"], 2), @@ -407,6 +402,9 @@ def get_header(contig_names=""): ##ALT= ##FORMAT= ##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT== 30 bp are ignored"> @@ -460,7 +458,6 @@ def get_header(contig_names=""): def to_vcf(df, args, names, outfile, show_names=True, contig_names="", header=None, small_output_f=True, sort_output=True, n_fields=None): - outfile.write(get_header(contig_names) + "\t" + "\t".join(names) + "\n") if show_names: @@ -478,21 +475,19 @@ def to_vcf(df, args, names, outfile, show_names=True, contig_names="", header=N count = 0 recs = [] - if args is not None: - add_kind = args["add_kind"] == "True" - if args["metrics"]: - small_output_f = False - if args["verbosity"] == '0': - df["contigA"] = [''] * len(df) - df["contigB"] = [''] * len(df) - elif args["verbosity"] == '1': - has_alt = [True if isinstance(i, str) and i[0] != '<' else False for i in df['variant_seq']] - df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)] - df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)] - - n_fields = len(col_names(small_output_f)[-1]) - else: + + add_kind = args["add_kind"] == "True" + if args["metrics"]: small_output_f = False + if args["verbosity"] == '0': + df["contigA"] = [''] * len(df) + df["contigB"] = [''] * len(df) + 
elif args["verbosity"] == '1': + has_alt = [True if (isinstance(i, str) and len(i) >= 1 and i[0] != '<') else False for i in df['variant_seq']] + df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)] + df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)] + + n_fields = len(col_names(small_output_f)[-1]) for idx, r in df.iterrows(): if idx in seen_idx: diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd index 866a729..f27962a 100644 --- a/dysgu/map_set_utils.pxd +++ b/dysgu/map_set_utils.pxd @@ -297,4 +297,5 @@ cdef class EventResult: cdef public bint preciseA, preciseB, linked, modified, remapped cdef public int8_t svlen_precise cdef public object contig, contig2, contig_cigar, contig2_cigar, svtype, join_type, chrA, chrB, exp_seq, sample, type, \ - partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info, qnames, haplotypes + partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info + cdef public object qnames, haplotype, phase_set, a_freq diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index c882661..fec01cc 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -436,16 +436,8 @@ cdef class EventResult: self.n_sa = 0 self.n_gaps = 0 self.compress = 0 + self.a_freq = 0 self.qnames = set([]) def __repr__(self): return str(to_dict(self)) - # return str(self.to_dict()) - - # def __getstate__(self): # for pickling - # return to_dict(self) - # # return self.to_dict() - # - # def __setstate__(self, d): - # for k, v in d.items(): - # self.__setattr__(k, v) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index cdfa231..931389e 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -9,8 +9,8 @@ import glob import gzip import pandas as pd -# import networkx as nx -# from itertools import combinations +import networkx as nx +from itertools import combinations pd.options.mode.chained_assignment = None from 
dysgu.scikitbio._ssw_wrapper import StripedSmithWaterman from dysgu.consensus import compute_rep @@ -324,6 +324,57 @@ def median(arr, start, end): return -1 +def get_ref_base(events, ref_genome, symbolic_sv_size): + for e in events: + if e.posA == 0: + e.posA = 1 # !! + if not e.ref_seq and not e.svtype == "BND": + if symbolic_sv_size == -1 or e.svtype in ("INS", "TRA", "INV"): + if e.posA == 0: + e.posA = 1 + try: + base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() + e.ref_seq = base + except: + pass + else: + bases = "" + if e.svlen < symbolic_sv_size: + start = e.posA + end = e.posB + else: + start = e.posA - 1 + end - e.posA + try: + bases = ref_genome.fetch(e.chrA, start, end).upper() + except: + pass + if e.svtype == "DEL": + e.ref_seq = bases + e.variant_seq = bases[0] if bases else "" + else: + e.variant_seq = bases + e.ref_seq = bases[0] if bases else "" + + # + # + # + # try: + # bases = ref_genome.fetch(e.chrA, e.posA, e.posB).upper() + # + # except: + # pass + # e.ref_seq = bases + # e.variant_seq = bases[0] if len(bases) else "" + # else: + # try: + # bases = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() + # except: + # pass + # e.ref_seq = bases + return events + + def nCk(n,k): f = math.factorial return f(n) // f(k) // f(n-k) @@ -352,38 +403,6 @@ def strand_binom_t(events): return events -def get_ref_base(events, ref_genome, symbolic_sv_size): - for e in events: - if not e.ref_seq and not e.svtype == "BND": - if symbolic_sv_size == -1 or e.svtype == "INS" or e.svtype == "TRA": - if e.posA == 0: - e.posA = 1 - try: - base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() - e.ref_seq = base - except: - pass - else: - if e.svlen < symbolic_sv_size: - if e.posA == 0: - e.posA = 1 - try: - bases = ref_genome.fetch(e.chrA, e.posA, e.posB).upper() - e.ref_seq = bases - e.variant_seq = bases[0] if len(bases) else "" - except: - pass - else: - if e.posA == 0: - e.posA = 1 - try: - base = ref_genome.fetch(e.chrA, e.posA - 1, 
e.posA).upper() - e.ref_seq = base - except: - pass - return events - - # from svtyper with minor modifications # https://github.com/hall-lab/svtyper/blob/master/svtyper/singlesample.py # efficient combinatorial function to handle extremely large numbers @@ -593,33 +612,102 @@ def is_reciprocal_overlapping(x1,x2, y1,y2): e2.GT = "0/1" -# def phase_haplotypes(events): -# qname_2_eventidx = defaultdict(set) -# for idx, e in enumerate(events): -# if e.qnames and e.GT.replace('/', '|') != '1|1' and e.GT[0] != '.': -# for qname in e.qnames: -# qname_2_eventidx[qname].add(idx) -# -# G = nx.Graph() -# for qname, idxs in qname_2_eventidx.items(): -# for u, v in combinations(idxs, 2): -# if G.has_edge(u, v): -# G[u][v]['weight'] += 1 -# else: -# G.add_edge(u, v, weight=1) -# bad_edges = [] -# for e in G.edges(data=True): -# if e[2]['weight'] < 2: -# bad_edges.append(e) -# G.remove_edges_from(bad_edges) -# echo('G', len(G.edges())) -# for component in nx.connected_components(G): -# echo('component', component) -# gts = [events[idx].GT for idx in component] -# echo('gts', gts) -# echo(list(f'{e.chrA}:{e.posA}({e.svlen})' for e in [events[idx] for idx in component])) -# sub = G.subgraph(component) -# echo(sub.edges(data=True)) +def low_ps_support(r, support_fraction=0.1): + min_support = round(1.5 + support_fraction * r.su) + return min_support + + + +def join_phase_sets(events, ps_id): + # Join phase sets if support is greater than threshold + G = nx.Graph() + for idx, e in enumerate(events): + if not e.phase_set: + e.phase_set = '' + continue + if len(e.phase_set) > 1: + if e.GT not in '1|11/1': # only heterozygous variants + threshold = low_ps_support(e) + passing_ps = [k for k, v in e.phase_set.items() if v >= threshold] + e.phase_set = passing_ps + if len(passing_ps) > 1: + G.add_edges_from(combinations(passing_ps, 2)) + else: + # Choose PS with maximum support + e.phase_set = str(max(e.phase_set, key=e.phase_set.get)) + else: + e.phase_set = 
str(list(e.phase_set.keys())[0]) + + new_phase_set = {} + for sub in nx.connected_components(G): + for n in sub: + new_phase_set[n] = str(ps_id) + ps_id += 1 + + for e in events: + if not e.phase_set or isinstance(e.phase_set, str): + continue + assert isinstance(e.phase_set, list) + new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set]))[0] + if not new_p: + e.phase_set = '' + else: + e.phase_set = str(new_p) + + +def get_hp_format(events): + max_ps = 0 + any_phase_set = False + keys = set([]) + for e in events: + if e.haplotype: + for k in e.haplotype: + if k != 'u': + keys.add(k) + keys = sorted(list(keys)) + for e in events: + if e.haplotype: + all_haps = e.haplotype + phased_haps = {k: v for k, v in e.haplotype.items() if k != 'u'} + n_haps = len(phased_haps) + n_phase_set = len(e.phase_set) if e.phase_set else 0 + if n_phase_set: + ps = max(e.phase_set, key=e.phase_set.get) + if ps > max_ps: + max_ps = ps + any_phase_set = True + else: + e.phase_set = "" + if n_haps >= 2: + if e.GT == "1/1": + e.GT = "1|1" + else: + best_hap = max(phased_haps, key=phased_haps.get) + if best_hap == 1: + e.GT = "1|0" + else: + e.GT = "0|1" + elif n_haps == 1: + if 1 in e.haplotype: + e.GT = "1|0" + else: + e.GT = "0|1" + + hp_string = '' + if n_haps >= 1: + unphased = all_haps['u'] if 'u' in all_haps else 0 + hp_string = '|'.join([str(phased_haps[k]) if k in phased_haps else '0' for k in keys]) + if unphased: + hp_string += f'_{unphased}' + if not hp_string: + hp_string = "." + + e.haplotype = hp_string + else: + e.haplotype = "." 
+ + + return max_ps, any_phase_set def get_gt_metric2(events, mode, add_gt=True): @@ -677,68 +765,17 @@ def get_gt_metric2(events, mode, add_gt=True): fix_inconsistent_gt(events) - # any_haps = False - for e in events: - if e.haplotypes: - # any_haps = True - n_haps = len(e.haplotypes) - if n_haps >= 2: - if e.GT == "1/1": - e.GT = "1|1" - else: - best_hap = max(e.haplotypes, key=e.haplotypes.get) - if best_hap == 1: - e.GT = "1|0" - else: - e.GT = "0|1" - elif n_haps == 1: - if 1 in e.haplotypes: - e.GT = "1|0" - else: - e.GT = "0|1" + max_ps, any_phase_set = get_hp_format(events) - # phase_haplotypes(events) + if any_phase_set: + join_phase_sets(events, max_ps + 1) + else: + for e in events: + e.phase_set = "." return events -# def filter_microsatellite_non_diploid(potential): -# tmp_list = defaultdict(list) -# max_dist = 50 -# half_d = (max_dist * 0.5) -# candidates = [] -# for idx in range(len(potential)): -# ei = potential[idx] -# if (ei.svtype != "DEL" and ei.svtype != "INS") or not ei.variant_seq: -# continue -# rep = compute_rep(ei.variant_seq) -# if rep < 0.4: -# continue -# candidates.append(idx) -# tmp_list[ei.chrA].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) -# if ei.chrA == ei.chrB and ei.svlen > half_d: -# tmp_list[ei.chrA].append((ei.posB - ei.cipos95B - max_dist, ei.posB + ei.cipos95B + max_dist, idx)) -# -# nc2 = {k: iitree(v, add_value=True) for k, v in tmp_list.items()} -# seen = set([]) -# to_drop = set([]) -# for idx in candidates: -# if idx in seen: -# continue -# ei = potential[idx] -# ols = set(nc2[ei.chrA].allOverlappingIntervals(ei.posA, ei.posA + 1)) -# ols2 = set(nc2[ei.chrB].allOverlappingIntervals(ei.posB, ei.posB + 1)) -# ols |= ols2 -# ols.add(idx) -# if len(ols) <= 2: -# continue -# bad = sorted([(i, potential[i]) for i in ols], key=lambda x: x[1].su, reverse=True)[2:] -# for jdx, p in bad: -# to_drop.add(jdx) -# seen |= ols -# return [p for i, p in enumerate(potential) if i not in to_drop] - - 
def compressability(events): for e in events: c1 = [] diff --git a/dysgu/view.py b/dysgu/view.py index 8fdf428..ca04718 100644 --- a/dysgu/view.py +++ b/dysgu/view.py @@ -362,6 +362,9 @@ def vcf_to_df(path): "JIT": ("jitter", float), "LEFT_SVINSSEQ": ("left_ins_seq", str), "RIGHT_SVINSSEQ": ("right_ins_seq", str), + "PSET": ("phase_set", str), + "HP": ("haplotype", str), + "AF": ("a_freq", float), } # df = df[df.posA == 110156314] df.rename(columns={k: v[0] for k, v in col_map.items()}, inplace=True) From 672a90fd1be98273cd41742b3f118adbd2344023 Mon Sep 17 00:00:00 2001 From: kcleal Date: Mon, 24 Feb 2025 14:20:02 +0000 Subject: [PATCH 5/8] Fixed index error --- dysgu/post_call.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index 931389e..c1119b9 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -648,11 +648,11 @@ def join_phase_sets(events, ps_id): if not e.phase_set or isinstance(e.phase_set, str): continue assert isinstance(e.phase_set, list) - new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set]))[0] + new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set])) if not new_p: e.phase_set = '' else: - e.phase_set = str(new_p) + e.phase_set = str(new_p[0]) def get_hp_format(events): From f86145b17a2eda41d2e6543503338ae643d8750a Mon Sep 17 00:00:00 2001 From: kcleal Date: Wed, 26 Feb 2025 10:16:11 +0000 Subject: [PATCH 6/8] Changed empty PSET and HP to 0 --- README.rst | 30 ++++++++++++++++++++++-------- dysgu/post_call.py | 6 +++--- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/README.rst b/README.rst index 977a918..c79c873 100644 --- a/README.rst +++ b/README.rst @@ -42,7 +42,7 @@ for calling structural variants using paired-end or long read sequencing data. S ⚙️ Installation --------------- -Dysgu can be installed via pip using Python 3.10 - 3.12 and has been tested on linux and MacOS. 
+Dysgu can be installed via pip using Python >=3.10 and has been tested on linux and MacOS. The list of python packages needed can be found in requirements.txt. To install:: @@ -79,7 +79,8 @@ For help use:: dysgu --help dysgu command --help -To use the python-API see the `documentation `_, or `jupyter notebook `_, +To use the python-API see the `documentation `_, +or `jupyter notebook `_, 🎯 Calling SVs @@ -87,13 +88,16 @@ To use the python-API see the `documentation svs.vcf -This will first run the `fetch` command that will create a temporary bam file plus other analysis files in the directory `temp_dir`. These temporary files are then analysed using the `call` program. +This will first run the `fetch` command that will create a temporary bam file plus other analysis files +in the directory `temp_dir`. These temporary files are then analysed using the `call` program. To make use of multiprocessing, set the "-p" parameter:: dysgu run -p4 reference.fa temp_dir input.bam > svs.vcf @@ -104,14 +108,24 @@ Dysgu also accepts reads from stdin. In this example, the --clean flag will remo Long reads ********** -Dysgy is designed to work with long reads aligned using minimap2 or ngmlr. Use the 'call' pipeline if starting with a bam file, or 'run' if starting with a cram:: +Dysgu is designed to work with long reads aligned using minimap2 or ngmlr. Use the 'call' pipeline if +starting with a bam file, or 'run' if starting with a cram:: dysgu call --mode pacbio-revio reference.fa temp_dir input.bam > svs.vcf dysgu call --mode nanopore-r10 reference.fa temp_dir input.bam > svs.vcf -Presets are available using the `--mode` option for PacBio `pacbio-sequel2 | pacbio-revio`, and ONT `nanopore-r9 | nanopore-r10` -If you are using using reads with higher error rates, or are unsure of the read-accuracy, it is recommended to set '--divergence auto', to infer a more conservative sequence divergence. 
The default is set at 0.02 which can be too stringent for lower accuracy reads and will result in more reads being filtered and lower sensitivity:: +Presets are available using the `--mode` option for PacBio `pacbio-sequel2 | pacbio-revio`, +and ONT `nanopore-r9 | nanopore-r10`. + +Since v1.8, higher calling accuracy can be achieved by adding "haplotags" to your input bam/cram file +(using hiphase / whatshap / longshot etc). Dysgu reads HP and PS tags automatically and produces +phased output calls. + +If you are using reads with higher error rates, +it is recommended to set '--divergence auto', to infer a more conservative sequence divergence. +The default is set at 0.02 which can be too stringent for lower accuracy reads and will result in +more reads being filtered and lower sensitivity:: dysgu call --divergence auto --mode nanopore reference.fa temp_dir input.bam > svs.vcf diff --git a/dysgu/post_call.py b/dysgu/post_call.py index c1119b9..945c7f8 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -700,11 +700,11 @@ def get_hp_format(events): if unphased: hp_string += f'_{unphased}' if not hp_string: - hp_string = "." + hp_string = "0" e.haplotype = hp_string else: - e.haplotype = "." + e.haplotype = "0" return max_ps, any_phase_set @@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True): join_phase_sets(events, max_ps + 1) else: for e in events: - e.phase_set = "." 
+ e.phase_set = "-1" return events From 47d642cd6d9935541f8d7390f20a92e5722c4788 Mon Sep 17 00:00:00 2001 From: kcleal Date: Wed, 26 Feb 2025 10:20:37 +0000 Subject: [PATCH 7/8] PSET and HP default to empty string --- dysgu/post_call.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index 945c7f8..d400d10 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -700,11 +700,11 @@ def get_hp_format(events): if unphased: hp_string += f'_{unphased}' if not hp_string: - hp_string = "0" + hp_string = "" e.haplotype = hp_string else: - e.haplotype = "0" + e.haplotype = "" return max_ps, any_phase_set @@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True): join_phase_sets(events, max_ps + 1) else: for e in events: - e.phase_set = "-1" + e.phase_set = "" return events From de5512f1527ffb55660205d8d2efbad936664ff4 Mon Sep 17 00:00:00 2001 From: kcleal Date: Wed, 26 Feb 2025 10:22:08 +0000 Subject: [PATCH 8/8] PSET and HP default to '-1' and '0' --- dysgu/post_call.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index d400d10..945c7f8 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -700,11 +700,11 @@ def get_hp_format(events): if unphased: hp_string += f'_{unphased}' if not hp_string: - hp_string = "" + hp_string = "0" e.haplotype = hp_string else: - e.haplotype = "" + e.haplotype = "0" return max_ps, any_phase_set @@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True): join_phase_sets(events, max_ps + 1) else: for e in events: - e.phase_set = "" + e.phase_set = "-1" return events