Skip to content

Commit

Permalink
Added PSET phaseset HP haplotype tag and AF allele frequency tags to …
Browse files Browse the repository at this point in the history
…output
  • Loading branch information
kcleal committed Feb 24, 2025
1 parent 634102f commit c4e1484
Show file tree
Hide file tree
Showing 8 changed files with 223 additions and 175 deletions.
57 changes: 38 additions & 19 deletions dysgu/call_component.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,32 @@ cdef base_quals_aligned_clipped(AlignedSegment a):
return aligned_base_quals, aligned_bases, clipped_base_quals, clipped_bases


cdef collect_phase_tags(bint get_hp_tag, AlignedSegment a, EventResult_t er):
if not get_hp_tag:
return
if not a.has_tag("HP"):
if er.haplotype is None or 'u' not in er.haplotype:
er.haplotype = {'u': 1}
else:
er.haplotype['u'] += 1
return
hp_tag = a.get_tag("HP")
if not er.haplotype:
er.haplotype = {hp_tag: 1}
elif hp_tag not in er.haplotype:
er.haplotype[hp_tag] = 1
else:
er.haplotype[hp_tag] += 1
if a.has_tag("PS"):
pg_tag = a.get_tag("PS")
if not er.phase_set:
er.phase_set = {pg_tag: 1}
elif pg_tag not in er.phase_set:
er.phase_set[pg_tag] = 1
else:
er.phase_set[pg_tag] += 1


cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
EventResult_t er, bint paired_end_reads, bint get_hp_tag):
cdef float NMpri = 0
Expand All @@ -145,6 +171,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
cdef float a_bases, large_gaps, n_small_gaps
cdef AlignedSegment a
for index, a in enumerate(itertools.chain(reads1, reads2, [i.read_a for i in generic_ins])):
seen.add(a.qname)
flag = a.flag
if flag & 2:
er.NP += 1
Expand All @@ -162,16 +189,11 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
n_sa += a.get_tag("SA").count(";")
if a.has_tag("XA"):
n_xa += a.get_tag("XA").count(";")
if get_hp_tag and a.has_tag("HP"):
hp_tag = a.get_tag("HP")
if not er.haplotypes:
er.haplotypes = {hp_tag: 1}
elif hp_tag not in er.haplotypes:
er.haplotypes[hp_tag] = 1
else:
er.haplotypes[hp_tag] += 1

collect_phase_tags(get_hp_tag, a, er)

a_bases, large_gaps, n_small_gaps = n_aligned_bases(a)

if a_bases > 0:
n_gaps += n_small_gaps / a_bases
if flag & 2304: # Supplementary (and not primary if -M if flagged using bwa)
Expand All @@ -190,7 +212,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
total_pri += 1
MAPQpri += a.mapq
if paired_end_reads:
if index >= len(reads2) and a.qname in paired_end:
if index >= len(reads1) and a.qname in paired_end:
er.pe += 1
else:
paired_end.add(a.qname)
Expand All @@ -217,6 +239,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
clipped_bases += cb

for a in spanning:
seen.add(a.qname)
flag = a.flag
if flag & 2:
er.NP += 1
Expand All @@ -229,16 +252,10 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
n_sa += a.get_tag("SA").count(";")
if a.has_tag("XA"):
n_xa += a.get_tag("XA").count(";")
if get_hp_tag and a.has_tag("HP"):
hp_tag = a.get_tag("HP")
if not er.haplotypes:
er.haplotypes = {hp_tag: 1}
elif hp_tag not in er.haplotypes:
er.haplotypes[hp_tag] = 1
else:
er.haplotypes[hp_tag] += 1

collect_phase_tags(get_hp_tag, a, er)
a_bases, large_gaps, n_small_gaps = n_aligned_bases(a)

if a_bases > 0:
n_gaps += n_small_gaps / a_bases
if flag & 2304: # Supplementary
Expand All @@ -257,7 +274,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
total_pri += 1
if paired_end_reads:
if a.qname in paired_end: # If two primary reads from same pair
er.pe += 2
er.pe += 1
else:
paired_end.add(a.qname)
if a.has_tag("NM"):
Expand Down Expand Up @@ -296,6 +313,8 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins,
else:
er.clip_qual_ratio = 0

er.a_freq += len(seen)


cdef int within_read_end_position(int event_pos, CigarItem cigar_item):
cdef int end
Expand Down Expand Up @@ -918,7 +937,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag):
hp_count += 1
lengths = [s.cigar_item.len for s in hp_spanning]
cluster_lengths(clusters, lengths, hp_spanning, eps)
# Merge near-identical haplotypes
# Merge near-identical haplotype
if len(clusters) > 1:
cl_srt = sorted([ [ np.mean([c.cigar_item.len for c in clst]), clst ] for clst in clusters], key=lambda x: x[0])
merged_haps = [cl_srt[0]]
Expand Down
2 changes: 0 additions & 2 deletions dysgu/cluster.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -511,8 +511,6 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N

preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"])

# preliminaries = post_call.filter_microsatellite_non_diploid(preliminaries)

preliminaries = extra_metrics.sample_level_density(preliminaries, regions)
preliminaries = coverage_analyser.normalize_coverage_values(preliminaries)

Expand Down
3 changes: 3 additions & 0 deletions dysgu/coverage.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -621,5 +621,8 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov
if r.chrA != r.chrB:
r.svlen = 1000000
r.mcov = max_depth

r.a_freq = r.a_freq / reads_10kb
r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2)
new_events.append(r)
return new_events
47 changes: 21 additions & 26 deletions dysgu/io_funcs.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,13 @@ cpdef list col_names(small_output):

if small_output:
return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc",
["GT", "GQ", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase",
["GT", "GQ", "phase_set", "haplotype", "a_freq", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase",
"raw_reads_10kb", "neigh10kb", "plus",
"minus", "remap_score", "remap_ed", "bad_clip_count", "fcc", "inner_cn", "outer_cn", "prob"]
]
else:
return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc",
["GT", "GQ", "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP",
["GT", "GQ", "phase_set", "haplotype", "a_freq", "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP",
"maxASsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "sqc", "scw", "clip_qual_ratio", "block_edge",
"raw_reads_10kb", "mcov",
"linked", "neigh", "neigh10kb", "ref_bases", "plus",
Expand All @@ -157,11 +157,6 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small
rep, repsc, lenprec = 0, 0, 1
mean_prob, max_prob = None, None
debug = False
# if abs(r['posA'] - 39084726) < 10:
# echo('found', dict(r))
# echo(len(format_f) > 1)
# echo([(int(v["su"]), v["posA"], v["event_id"], k) for k, v in df_rows.items()])
# debug = True
if len(format_f) > 1:
best = sorted([(int(v["su"]), k) for k, v in df_rows.items()], reverse=True)[0][1]
probs = [v["prob"] for k, v in df_rows.items()]
Expand Down Expand Up @@ -273,9 +268,9 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small
info_extras += [f"MeanPROB={round(mean_prob, 3)}", f"MaxPROB={round(max_prob, 3)}"]

if small_output:
fmt_keys = "GT:GQ:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB"
fmt_keys = "GT:GQ:PSET:HP:AF:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB"
else:
fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB"
fmt_keys = "GT:GQ:PSET:HP:AF:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB"

if "variant_seq" in r and isinstance(r["variant_seq"], str):
if r['svtype'] == "INS" or r.variant_seq:
Expand Down Expand Up @@ -318,13 +313,13 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small

def get_fmt(r, small_output):
if small_output:
v = [r["GT"], r["GQ"], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'],
v = [r["GT"], r["GQ"], r["phase_set"], r["haplotype"], r['a_freq'], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'],
r['raw_reads_10kb'], r['neigh10kb'], r["plus"], r["minus"], r["remap_score"], r["remap_ed"],
r["bad_clip_count"], round(r["fcc"], 3), round(r["inner_cn"], 3), round(r["outer_cn"], 3), r['prob']
]
return v
else:
v = [r["GT"], r["GQ"], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'],
v = [r["GT"], r["GQ"], r["phase_set"], r["haplotype"], r['a_freq'], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'],
r['MAPQsupp'], r['NP'], r['maxASsupp'], r['su'], r['spanning'], r['pe'], r['supp'],
r['sc'], r['bnd'], round(r['sqc'], 2), round(r['scw'], 1), round(r['clip_qual_ratio'], 3), r['block_edge'], r['raw_reads_10kb'], round(r['mcov'], 2), int(r['linked']), r['neigh'], r['neigh10kb'],
r['ref_bases'], r["plus"], r["minus"], round(r["strand_binom_t"], 4), r['n_gaps'], round(r["n_sa"], 2),
Expand Down Expand Up @@ -407,6 +402,9 @@ def get_header(contig_names=""):
##ALT=<ID=TRA,Description="Translocation">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality phred scaled">
##FORMAT=<ID=PSET,Number=1,Type=String,Description="Phase-set ID for phased SVs">
##FORMAT=<ID=HP,Number=1,Type=String,Description="Phased read support HP1[|HP2|...HPn]_unphased. Leading underscore (e.g. _4) indicates all reads unphased. No underscore implies no unphased reads">
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
##FORMAT=<ID=NMP,Number=1,Type=Float,Description="Mean edit distance for primary alignments supporting the variant">
##FORMAT=<ID=NMS,Number=1,Type=Float,Description="Mean edit distance for supplementary alignments supporting the variant">
##FORMAT=<ID=NMB,Number=1,Type=Float,Description="Mean basic, edit distance. Gaps >= 30 bp are ignored">
Expand Down Expand Up @@ -460,7 +458,6 @@ def get_header(contig_names=""):
def to_vcf(df, args, names, outfile, show_names=True, contig_names="", header=None,
small_output_f=True, sort_output=True, n_fields=None):


outfile.write(get_header(contig_names) + "\t" + "\t".join(names) + "\n")

if show_names:
Expand All @@ -478,21 +475,19 @@ def to_vcf(df, args, names, outfile, show_names=True, contig_names="", header=N

count = 0
recs = []
if args is not None:
add_kind = args["add_kind"] == "True"
if args["metrics"]:
small_output_f = False
if args["verbosity"] == '0':
df["contigA"] = [''] * len(df)
df["contigB"] = [''] * len(df)
elif args["verbosity"] == '1':
has_alt = [True if isinstance(i, str) and i[0] != '<' else False for i in df['variant_seq']]
df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)]
df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)]

n_fields = len(col_names(small_output_f)[-1])
else:

add_kind = args["add_kind"] == "True"
if args["metrics"]:
small_output_f = False
if args["verbosity"] == '0':
df["contigA"] = [''] * len(df)
df["contigB"] = [''] * len(df)
elif args["verbosity"] == '1':
has_alt = [True if (isinstance(i, str) and len(i) >= 1 and i[0] != '<') else False for i in df['variant_seq']]
df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)]
df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)]

n_fields = len(col_names(small_output_f)[-1])

for idx, r in df.iterrows():
if idx in seen_idx:
Expand Down
3 changes: 2 additions & 1 deletion dysgu/map_set_utils.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -297,4 +297,5 @@ cdef class EventResult:
cdef public bint preciseA, preciseB, linked, modified, remapped
cdef public int8_t svlen_precise
cdef public object contig, contig2, contig_cigar, contig2_cigar, svtype, join_type, chrA, chrB, exp_seq, sample, type, \
partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info, qnames, haplotypes
partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info
cdef public object qnames, haplotype, phase_set, a_freq
10 changes: 1 addition & 9 deletions dysgu/map_set_utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -436,16 +436,8 @@ cdef class EventResult:
self.n_sa = 0
self.n_gaps = 0
self.compress = 0
self.a_freq = 0
self.qnames = set([])

def __repr__(self):
return str(to_dict(self))
# return str(self.to_dict())

# def __getstate__(self): # for pickling
# return to_dict(self)
# # return self.to_dict()
#
# def __setstate__(self, d):
# for k, v in d.items():
# self.__setattr__(k, v)
Loading

0 comments on commit c4e1484

Please sign in to comment.