Added PSET phaseset HP haplotype tag and AF allele frequency tags to …

…output
kcleal · Feb 24, 2025 · c4e1484 · c4e1484
1 parent 634102f
commit c4e1484
Show file tree

Hide file tree

Showing 8 changed files with 223 additions and 175 deletions.
diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx
@@ -119,6 +119,32 @@ cdef base_quals_aligned_clipped(AlignedSegment a):
     return aligned_base_quals, aligned_bases, clipped_base_quals, clipped_bases
 
 
+cdef collect_phase_tags(bint get_hp_tag, AlignedSegment a, EventResult_t er):
+    if not get_hp_tag:
+        return
+    if not a.has_tag("HP"):
+        if er.haplotype is None or 'u' not in er.haplotype:
+            er.haplotype = {'u': 1}
+        else:
+            er.haplotype['u'] += 1
+        return
+    hp_tag = a.get_tag("HP")
+    if not er.haplotype:
+        er.haplotype = {hp_tag: 1}
+    elif hp_tag not in er.haplotype:
+        er.haplotype[hp_tag] = 1
+    else:
+        er.haplotype[hp_tag] += 1
+    if a.has_tag("PS"):
+        pg_tag = a.get_tag("PS")
+        if not er.phase_set:
+            er.phase_set = {pg_tag: 1}
+        elif pg_tag not in er.phase_set:
+            er.phase_set[pg_tag] = 1
+        else:
+            er.phase_set[pg_tag] += 1
+
+
 cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
                        EventResult_t er, bint paired_end_reads, bint get_hp_tag):
     cdef float NMpri = 0
@@ -145,6 +171,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
     cdef float a_bases, large_gaps, n_small_gaps
     cdef AlignedSegment a
     for index, a in enumerate(itertools.chain(reads1, reads2, [i.read_a for i in generic_ins])):
+        seen.add(a.qname)
         flag = a.flag
         if flag & 2:
             er.NP += 1
@@ -162,16 +189,11 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             n_sa += a.get_tag("SA").count(";")
         if a.has_tag("XA"):
             n_xa += a.get_tag("XA").count(";")
-        if get_hp_tag and a.has_tag("HP"):
-            hp_tag = a.get_tag("HP")
-            if not er.haplotypes:
-                er.haplotypes = {hp_tag: 1}
-            elif hp_tag not in er.haplotypes:
-                er.haplotypes[hp_tag] = 1
-            else:
-                er.haplotypes[hp_tag] += 1
+
+        collect_phase_tags(get_hp_tag, a, er)
 
         a_bases, large_gaps, n_small_gaps = n_aligned_bases(a)
+
         if a_bases > 0:
             n_gaps += n_small_gaps / a_bases
         if flag & 2304:  # Supplementary (and not primary if -M if flagged using bwa)
@@ -190,7 +212,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             total_pri += 1
             MAPQpri += a.mapq
             if paired_end_reads:
-                if index >= len(reads2) and a.qname in paired_end:
+                if index >= len(reads1) and a.qname in paired_end:
                     er.pe += 1
                 else:
                     paired_end.add(a.qname)
@@ -217,6 +239,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             clipped_bases += cb
 
     for a in spanning:
+        seen.add(a.qname)
         flag = a.flag
         if flag & 2:
             er.NP += 1
@@ -229,16 +252,10 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             n_sa += a.get_tag("SA").count(";")
         if a.has_tag("XA"):
             n_xa += a.get_tag("XA").count(";")
-        if get_hp_tag and a.has_tag("HP"):
-            hp_tag = a.get_tag("HP")
-            if not er.haplotypes:
-                er.haplotypes = {hp_tag: 1}
-            elif hp_tag not in er.haplotypes:
-                er.haplotypes[hp_tag] = 1
-            else:
-                er.haplotypes[hp_tag] += 1
 
+        collect_phase_tags(get_hp_tag, a, er)
         a_bases, large_gaps, n_small_gaps = n_aligned_bases(a)
+
         if a_bases > 0:
             n_gaps += n_small_gaps / a_bases
         if flag & 2304:  # Supplementary
@@ -257,7 +274,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
             total_pri += 1
             if paired_end_reads:
                 if a.qname in paired_end:  # If two primary reads from same pair
-                    er.pe += 2
+                    er.pe += 1
                 else:
                     paired_end.add(a.qname)
             if a.has_tag("NM"):
@@ -296,6 +313,8 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
     else:
         er.clip_qual_ratio = 0
 
+    er.a_freq += len(seen)
+
 
 cdef int within_read_end_position(int event_pos, CigarItem cigar_item):
     cdef int end
@@ -918,7 +937,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag):
                 hp_count += 1
                 lengths = [s.cigar_item.len for s in hp_spanning]
                 cluster_lengths(clusters, lengths, hp_spanning, eps)
-            # Merge near-identical haplotypes
+            # Merge near-identical haplotype
             if len(clusters) > 1:
                 cl_srt = sorted([ [ np.mean([c.cigar_item.len for c in clst]), clst ] for clst in clusters], key=lambda x: x[0])
                 merged_haps = [cl_srt[0]]

diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx
@@ -511,8 +511,6 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
 
     preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"])
 
-    # preliminaries = post_call.filter_microsatellite_non_diploid(preliminaries)
-
     preliminaries = extra_metrics.sample_level_density(preliminaries, regions)
     preliminaries = coverage_analyser.normalize_coverage_values(preliminaries)
 

diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx
@@ -621,5 +621,8 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov
         if r.chrA != r.chrB:
             r.svlen = 1000000
         r.mcov = max_depth
+
+        r.a_freq = r.a_freq / reads_10kb
+        r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2)
         new_events.append(r)
     return new_events
diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx
@@ -138,13 +138,13 @@ cpdef list col_names(small_output):
 
     if small_output:
         return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc",
-          ["GT", "GQ", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase",
+          ["GT", "GQ", "phase_set", "haplotype", "a_freq", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase",
          "raw_reads_10kb", "neigh10kb", "plus",
                 "minus", "remap_score", "remap_ed", "bad_clip_count", "fcc", "inner_cn", "outer_cn", "prob"]
             ]
     else:
         return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise",  "rep", "gc",
-          ["GT", "GQ", "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP",
+          ["GT", "GQ", "phase_set", "haplotype", "a_freq",  "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP",
           "maxASsupp",  "su", "spanning", "pe", "supp", "sc", "bnd", "sqc", "scw", "clip_qual_ratio", "block_edge",
          "raw_reads_10kb", "mcov",
           "linked", "neigh", "neigh10kb",  "ref_bases", "plus",
@@ -157,11 +157,6 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small
     rep, repsc, lenprec = 0, 0, 1
     mean_prob, max_prob = None, None
     debug = False
-    # if abs(r['posA'] - 39084726) < 10:
-    #     echo('found', dict(r))
-    #     echo(len(format_f) > 1)
-    #     echo([(int(v["su"]), v["posA"], v["event_id"], k) for k, v in df_rows.items()])
-    #     debug = True
     if len(format_f) > 1:
         best = sorted([(int(v["su"]), k) for k, v in df_rows.items()], reverse=True)[0][1]
         probs = [v["prob"] for k, v in df_rows.items()]
@@ -273,9 +268,9 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small
         info_extras += [f"MeanPROB={round(mean_prob, 3)}", f"MaxPROB={round(max_prob, 3)}"]
 
     if small_output:
-        fmt_keys = "GT:GQ:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB"
+        fmt_keys = "GT:GQ:PSET:HP:AF:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB"
     else:
-        fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB"
+        fmt_keys = "GT:GQ:PSET:HP:AF:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB"
 
     if "variant_seq" in r and isinstance(r["variant_seq"], str):
         if r['svtype'] == "INS" or r.variant_seq:
@@ -318,13 +313,13 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small
 
 def get_fmt(r, small_output):
     if small_output:
-        v = [r["GT"], r["GQ"], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'],
+        v = [r["GT"], r["GQ"], r["phase_set"], r["haplotype"], r['a_freq'], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'],
              r['raw_reads_10kb'], r['neigh10kb'], r["plus"], r["minus"], r["remap_score"], r["remap_ed"],
              r["bad_clip_count"], round(r["fcc"], 3), round(r["inner_cn"], 3), round(r["outer_cn"], 3), r['prob']
              ]
         return v
     else:
-        v = [r["GT"], r["GQ"], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'],
+        v = [r["GT"], r["GQ"],  r["phase_set"], r["haplotype"], r['a_freq'], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'],
              r['MAPQsupp'], r['NP'], r['maxASsupp'], r['su'], r['spanning'], r['pe'], r['supp'],
              r['sc'], r['bnd'], round(r['sqc'], 2), round(r['scw'], 1), round(r['clip_qual_ratio'], 3), r['block_edge'], r['raw_reads_10kb'], round(r['mcov'], 2), int(r['linked']), r['neigh'], r['neigh10kb'],
              r['ref_bases'], r["plus"], r["minus"], round(r["strand_binom_t"], 4), r['n_gaps'], round(r["n_sa"], 2),
@@ -407,6 +402,9 @@ def get_header(contig_names=""):
 ##ALT=<ID=TRA,Description="Translocation">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality phred scaled">
+##FORMAT=<ID=PSET,Number=1,Type=String,Description="Phase-set ID for phased SVs">
+##FORMAT=<ID=HP,Number=1,Type=String,Description="Phased read support HP1[|HP2|...HPn]_unphased. Leading underscore (e.g. _4) indicates all reads unphased. No underscore implies no unphased reads">
+##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
 ##FORMAT=<ID=NMP,Number=1,Type=Float,Description="Mean edit distance for primary alignments supporting the variant">
 ##FORMAT=<ID=NMS,Number=1,Type=Float,Description="Mean edit distance for supplementary alignments supporting the variant">
 ##FORMAT=<ID=NMB,Number=1,Type=Float,Description="Mean basic, edit distance. Gaps >= 30 bp are ignored">
@@ -460,7 +458,6 @@ def get_header(contig_names=""):
 def to_vcf(df, args, names, outfile, show_names=True,  contig_names="", header=None,
            small_output_f=True, sort_output=True, n_fields=None):
 
-
     outfile.write(get_header(contig_names) + "\t" + "\t".join(names) + "\n")
 
     if show_names:
@@ -478,21 +475,19 @@ def to_vcf(df, args, names, outfile, show_names=True,  contig_names="", header=N
 
     count = 0
     recs = []
-    if args is not None:
-        add_kind = args["add_kind"] == "True"
-        if args["metrics"]:
-            small_output_f = False
-        if args["verbosity"] == '0':
-            df["contigA"] = [''] * len(df)
-            df["contigB"] = [''] * len(df)
-        elif args["verbosity"] == '1':
-            has_alt = [True if isinstance(i, str) and i[0] != '<' else False for i in df['variant_seq']]
-            df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)]
-            df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)]
-
-        n_fields = len(col_names(small_output_f)[-1])
-    else:
+
+    add_kind = args["add_kind"] == "True"
+    if args["metrics"]:
         small_output_f = False
+    if args["verbosity"] == '0':
+        df["contigA"] = [''] * len(df)
+        df["contigB"] = [''] * len(df)
+    elif args["verbosity"] == '1':
+        has_alt = [True if (isinstance(i, str) and len(i) >= 1 and i[0] != '<') else False for i in df['variant_seq']]
+        df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)]
+        df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)]
+
+    n_fields = len(col_names(small_output_f)[-1])
 
     for idx, r in df.iterrows():
         if idx in seen_idx:

diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd
@@ -297,4 +297,5 @@ cdef class EventResult:
     cdef public bint preciseA, preciseB, linked, modified, remapped
     cdef public int8_t svlen_precise
     cdef public object contig, contig2, contig_cigar, contig2_cigar, svtype, join_type, chrA, chrB, exp_seq, sample, type, \
-        partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info, qnames, haplotypes
+        partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info
+    cdef public object qnames, haplotype, phase_set, a_freq
diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx
@@ -436,16 +436,8 @@ cdef class EventResult:
         self.n_sa = 0
         self.n_gaps = 0
         self.compress = 0
+        self.a_freq = 0
         self.qnames = set([])
 
     def __repr__(self):
         return str(to_dict(self))
-        # return str(self.to_dict())
-
-    # def __getstate__(self):  # for pickling
-    #     return to_dict(self)
-    #     # return self.to_dict()
-    #
-    # def __setstate__(self, d):
-    #     for k, v in d.items():
-    #         self.__setattr__(k, v)