From dc0ff467ff128abc40f6e1b3090ac58f4c2d96d9 Mon Sep 17 00:00:00 2001 From: Kez Cleal <42997789+kcleal@users.noreply.github.com> Date: Fri, 31 Jan 2025 10:59:43 +0000 Subject: [PATCH 1/8] Dysgu_dev <-- master (#121) * Dysgu dev (#115) * dysgu_dev <- master (#108) * correct usage of update_filter_value in filter_normals It seems like at some point the usage of update_filter_value changed, because in several spots, it is missing the sample_name argument. This breaks `dysgu filter --keep-all`. * Update main.yml * Update main.yml * Update requirements.txt * updated build process to pyproject.toml (#103) * v1.6.6 updated README * Update main.yml * Added pyproject.toml * Updated pyproject.toml and setup.py * Update pyproject.toml --------- Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com> * Better recall+precision for long-reads. Improved merging pipeline. Faster runtime. Code refactoring. WIP # [skip ci] * Fixed issues with overriding presets. --mode option updated # [skip ci] * Fixed issues with overriding presets. --mode option updated to support r10 and revio, new presets available. --no-gt flat removed # [skip ci] * Fixed CLI issues for tests # [skip ci] * Fixed API issue * v1.7.0 * added deprecated warning for mode * Added filter for high divergence reads (long-reads only) # [skip ci] * Better read merging for long reads. New clustering methods for long reads. Improved runtime. * Update main.yml * Update main.yml * Update osx-deps * Update main.yml macos11 * Update main.yml * Update main.yml * Update main.yml * Update osx-deps * Update osx-deps * Update osx-deps * Update osx-deps * Update main.yml * Update osx-deps * Update main.yml * Updated workflow * Updated workflow * Updated workflow * Update main.yml * Updated workflow --------- Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com> * Fixes compile error for clang * Fixed merge None type error. 
Added wb3 as default compression level for long-reads (run-pipeline) (#116) * Updated workflow * Updated workflow * Fixed compiler warning --------- Co-authored-by: Dr. K. D. Murray <1560490+kdm9@users.noreply.github.com> --- .github/workflows/main.yml | 2 -- dysgu/call_component.pyx | 18 ++++++++---------- dysgu/graph.pyx | 4 ++-- dysgu/map_set_utils.pxd | 4 ++-- dysgu/map_set_utils.pyx | 12 ++++++------ setup.py | 1 - 6 files changed, 18 insertions(+), 23 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 33b9b6f..bf6a5fa 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,8 +11,6 @@ jobs: strategy: matrix: os: [ubuntu-20.04, macOS-14] - #os: [ubuntu-latest, macOS-11] - steps: - uses: actions/checkout@v4 - name: Set correct paths for Homebrew on macOS diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 94a608d..71b3a84 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -105,7 +105,7 @@ cdef base_quals_aligned_clipped(AlignedSegment a): cdef float clipped_base_quals = 0 cdef int left_clip = 0 cdef int right_clip = 0 - clip_sizes(a, left_clip, right_clip) + clip_sizes(a, &left_clip, &right_clip) clipped_bases = left_clip + right_clip cdef const unsigned char[:] quals = a.query_qualities cdef int i @@ -154,7 +154,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, er.n_unmapped_mates += 1 left_clip = 0 right_clip = 0 - clip_sizes_hard(a, left_clip, right_clip) + clip_sizes_hard(a, &left_clip, &right_clip) if left_clip > 0 and right_clip > 0: er.double_clips += 1 has_sa = a.has_tag("SA") @@ -206,7 +206,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, left_clip = 0 right_clip = 0 - clip_sizes(a, left_clip, right_clip) + clip_sizes(a, &left_clip, &right_clip) if left_clip or right_clip: er.sc += 1 if a.flag & 1: # paired read @@ -222,7 +222,7 @@ cdef count_attributes2(reads1, reads2, spanning, 
float insert_ppf, generic_ins, er.NP += 1 left_clip = 0 right_clip = 0 - clip_sizes_hard(a, left_clip, right_clip) + clip_sizes_hard(a, &left_clip, &right_clip) if left_clip > 0 and right_clip > 0: er.double_clips += 1 if a.has_tag("SA"): @@ -507,7 +507,7 @@ cdef make_generic_insertion_item(aln, int insert_size, int insert_std): v_item.svtype = "BND" left_clip = 0 right_clip = 0 - clip_sizes(aln, left_clip, right_clip) + clip_sizes(aln, &left_clip, &right_clip) clip_s = max(left_clip, right_clip) rand_insert_pos = 100 if not clip_s else clip_s v_item.inferred_sv_len = 0 if rand_insert_pos < 0 else rand_insert_pos @@ -945,6 +945,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag): def process_spanning(bint paired_end, spanning_alignments, float divergence, length_extend, informative, generic_insertions, float insert_ppf, bint to_assemble, bint hp_tag): + # echo("PROCESS SPANNING") cdef int min_found_support = 0 cdef str svtype, jointype @@ -996,6 +997,7 @@ def process_spanning(bint paired_end, spanning_alignments, float divergence, len # 1.7.0 # svlen = int(np.median([sp.cigar_item.len for sp in spanning_alignments])) + posA = spanning_alignments[best_index].pos posB = spanning_alignments[best_index].end er.preciseA = True @@ -2020,8 +2022,7 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, for (u, v), d in data.s_between: #.items(): rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info) # [(Nodeinfo, alignment)..] 
rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info) - # echo("rd_u", [rr[1].qname for rr in rd_u]) - # echo("rd_v", [rr[1].qname for rr in rd_v]) + total_reads = len(rd_u) + len(rd_v) buffered_reads += total_reads if add_to_buffer and buffered_reads > 50000: @@ -2106,9 +2107,6 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std n_parts = len(data.parts) if data.parts else 0 events = [] info = data.info - # echo(data.parts) - # echo(data.s_between) - # echo(data.s_within) if data.reads is None: data.reads = {} # next deal with info - need to filter these into the partitions, then deal with them in single / one_edge diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index 78a0e6e..cd0c984 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -252,7 +252,7 @@ cdef class ClipScoper: unordered_set[int]& clustered_nodes): cdef int clip_left = 0 cdef int clip_right = 0 - clip_sizes(r, clip_left, clip_right) + clip_sizes(r, &clip_left, &clip_right) if chrom != self.current_chrom: self.scope_left.clear() self.scope_right.clear() @@ -1368,7 +1368,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering pos2 = -1 left_clip_size = 0 right_clip_size = 0 - clip_sizes_hard(r, left_clip_size, right_clip_size) # soft and hard-clips + clip_sizes_hard(r, &left_clip_size, &right_clip_size) # soft and hard-clips if r.flag & 8 and clipped: # paired by inference # skip if both ends are clipped, usually means its a chunk of badly mapped sequence # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ): diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd index 2dcb72b..866a729 100644 --- a/dysgu/map_set_utils.pxd +++ b/dysgu/map_set_utils.pxd @@ -266,9 +266,9 @@ cdef extern from "" namespace "std" nogil: cdef int cigar_exists(AlignedSegment r) -cdef void clip_sizes(AlignedSegment r, int& left, int& right) +cdef void clip_sizes(AlignedSegment r, 
int* left, int* right) -cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right) +cdef void clip_sizes_hard(AlignedSegment r, int* left, int* right) cdef int cigar_clip(AlignedSegment r, int clip_length) diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index 4b69999..e6ab2e0 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -223,7 +223,7 @@ cdef int cigar_exists(AlignedSegment r): return 0 -cdef void clip_sizes(AlignedSegment r, int& left, int& right): +cdef void clip_sizes(AlignedSegment r, int* left, int* right): cdef uint32_t cigar_value cdef uint32_t cigar_l cdef uint32_t *cigar_p @@ -235,14 +235,14 @@ cdef void clip_sizes(AlignedSegment r, int& left, int& right): cigar_value = cigar_p[0] opp = cigar_value & 15 if opp == 4: - left = cigar_value >> 4 + left[0] = cigar_value >> 4 cigar_value = cigar_p[cigar_l - 1] opp = cigar_value & 15 if opp == 4: - right = cigar_value >> 4 + right[0] = cigar_value >> 4 -cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right): +cdef void clip_sizes_hard(AlignedSegment r, int* left, int* right): cdef uint32_t cigar_value cdef uint32_t cigar_l cdef uint32_t *cigar_p @@ -255,11 +255,11 @@ cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right): cigar_value = cigar_p[0] opp = cigar_value & 15 if opp == 4 or opp == 5: - left = cigar_value >> 4 + left[0] = cigar_value >> 4 cigar_value = cigar_p[cigar_l - 1] opp = cigar_value & 15 if opp == 4 or opp == 5: - right = cigar_value >> 4 + right[0] = cigar_value >> 4 cdef int cigar_clip(AlignedSegment r, int clip_length): diff --git a/setup.py b/setup.py index ee20dde..4fdfa27 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,6 @@ def get_extra_args(): extras = get_extra_args() + ["-Wno-sign-compare", "-Wno-unused-function", "-Wno-unused-result", '-Wno-ignored-qualifiers', "-Wno-deprecated-declarations", "-fpermissive", - "-Wno-unreachable-code-fallthrough", ] def get_extension_modules(): From 
5702658354c2554b6017be493561592799336ccc Mon Sep 17 00:00:00 2001 From: kcleal Date: Fri, 31 Jan 2025 11:33:22 +0000 Subject: [PATCH 2/8] Fixed compile errors. Add logging for HP tag usage --- dysgu/call_component.pyx | 7 ++++--- dysgu/cluster.pyx | 3 +++ dysgu/extra_metrics.pyx | 2 +- dysgu/map_set_utils.pyx | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 71b3a84..39a850c 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -1459,8 +1459,8 @@ cdef tuple mask_soft_clips(AlignedSegment a, AlignedSegment b): cdef int left_clipB = 0 cdef int right_clipB = 0 - clip_sizes_hard(a, left_clipA, right_clipA) - clip_sizes_hard(b, left_clipB, right_clipB) + clip_sizes_hard(a, &left_clipA, &right_clipA) + clip_sizes_hard(b, &left_clipB, &right_clipB) cdef int a_template_start = 0 cdef int b_template_start = 0 @@ -1643,7 +1643,8 @@ cdef call_from_reads(u_reads_info, v_reads_info, int insert_size, int insert_std cdef AlignedSegment a, b cdef uint32_t cigar_l_a, cigar_l_b - cdef uint32_t *cigar_p_a, *cigar_p_b + cdef uint32_t *cigar_p_a + cdef uint32_t *cigar_p_b for qname in [k for k in grp_u if k in grp_v]: # Qname found on both sides diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index ad3aea4..1b21d90 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -249,6 +249,9 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N sites_index = sites_adder.sites_index logging.info("Graph constructed") + if not args['no_phase'] and hp_tag_found: + logging.info("Using HP tag") + auto_support = False if args["min_support"] != "auto": args["min_support"] = int(args["min_support"]) diff --git a/dysgu/extra_metrics.pyx b/dysgu/extra_metrics.pyx index 80d7160..aabf72e 100644 --- a/dysgu/extra_metrics.pyx +++ b/dysgu/extra_metrics.pyx @@ -95,7 +95,7 @@ cdef float soft_clip_qual_corr(reads): quals = r.query_qualities left_clip = 0 right_clip = 0 - clip_sizes(r, 
left_clip, right_clip) + clip_sizes(r, &left_clip, &right_clip) if quals is None: continue diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index e6ab2e0..c882661 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -269,7 +269,7 @@ cdef int cigar_clip(AlignedSegment r, int clip_length): return 0 cdef int left = 0 cdef int right = 0 - clip_sizes(r, left, right) + clip_sizes(r, &left, &right) if left >= clip_length or right >= clip_length: return 1 return 0 From 634102f3bb75310cdc63a62714aa8f97e3161413 Mon Sep 17 00:00:00 2001 From: kcleal Date: Fri, 31 Jan 2025 11:50:35 +0000 Subject: [PATCH 3/8] Added superintervals dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 390e8d1..91c0451 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "networkx >= 2.4", "scikit-learn >= 0.22", "sortedcontainers", + "superintervals", "lightgbm" ] From c4e1484a985aa50d246e8037b23c3d7faf2ff4c8 Mon Sep 17 00:00:00 2001 From: kcleal Date: Mon, 24 Feb 2025 13:05:56 +0000 Subject: [PATCH 4/8] Added PSET phaseset HP haplotype tag and AF allele frequency tags to output --- dysgu/call_component.pyx | 57 +++++--- dysgu/cluster.pyx | 2 - dysgu/coverage.pyx | 3 + dysgu/io_funcs.pyx | 47 +++---- dysgu/map_set_utils.pxd | 3 +- dysgu/map_set_utils.pyx | 10 +- dysgu/post_call.py | 273 ++++++++++++++++++++++----------------- dysgu/view.py | 3 + 8 files changed, 223 insertions(+), 175 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 39a850c..0d5ebcf 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -119,6 +119,32 @@ cdef base_quals_aligned_clipped(AlignedSegment a): return aligned_base_quals, aligned_bases, clipped_base_quals, clipped_bases +cdef collect_phase_tags(bint get_hp_tag, AlignedSegment a, EventResult_t er): + if not get_hp_tag: + return + if not a.has_tag("HP"): + if er.haplotype is None or 'u' not in 
er.haplotype: + er.haplotype = {'u': 1} + else: + er.haplotype['u'] += 1 + return + hp_tag = a.get_tag("HP") + if not er.haplotype: + er.haplotype = {hp_tag: 1} + elif hp_tag not in er.haplotype: + er.haplotype[hp_tag] = 1 + else: + er.haplotype[hp_tag] += 1 + if a.has_tag("PS"): + pg_tag = a.get_tag("PS") + if not er.phase_set: + er.phase_set = {pg_tag: 1} + elif pg_tag not in er.phase_set: + er.phase_set[pg_tag] = 1 + else: + er.phase_set[pg_tag] += 1 + + cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, EventResult_t er, bint paired_end_reads, bint get_hp_tag): cdef float NMpri = 0 @@ -145,6 +171,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, cdef float a_bases, large_gaps, n_small_gaps cdef AlignedSegment a for index, a in enumerate(itertools.chain(reads1, reads2, [i.read_a for i in generic_ins])): + seen.add(a.qname) flag = a.flag if flag & 2: er.NP += 1 @@ -162,16 +189,11 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, n_sa += a.get_tag("SA").count(";") if a.has_tag("XA"): n_xa += a.get_tag("XA").count(";") - if get_hp_tag and a.has_tag("HP"): - hp_tag = a.get_tag("HP") - if not er.haplotypes: - er.haplotypes = {hp_tag: 1} - elif hp_tag not in er.haplotypes: - er.haplotypes[hp_tag] = 1 - else: - er.haplotypes[hp_tag] += 1 + + collect_phase_tags(get_hp_tag, a, er) a_bases, large_gaps, n_small_gaps = n_aligned_bases(a) + if a_bases > 0: n_gaps += n_small_gaps / a_bases if flag & 2304: # Supplementary (and not primary if -M if flagged using bwa) @@ -190,7 +212,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, total_pri += 1 MAPQpri += a.mapq if paired_end_reads: - if index >= len(reads2) and a.qname in paired_end: + if index >= len(reads1) and a.qname in paired_end: er.pe += 1 else: paired_end.add(a.qname) @@ -217,6 +239,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, clipped_bases += cb 
for a in spanning: + seen.add(a.qname) flag = a.flag if flag & 2: er.NP += 1 @@ -229,16 +252,10 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, n_sa += a.get_tag("SA").count(";") if a.has_tag("XA"): n_xa += a.get_tag("XA").count(";") - if get_hp_tag and a.has_tag("HP"): - hp_tag = a.get_tag("HP") - if not er.haplotypes: - er.haplotypes = {hp_tag: 1} - elif hp_tag not in er.haplotypes: - er.haplotypes[hp_tag] = 1 - else: - er.haplotypes[hp_tag] += 1 + collect_phase_tags(get_hp_tag, a, er) a_bases, large_gaps, n_small_gaps = n_aligned_bases(a) + if a_bases > 0: n_gaps += n_small_gaps / a_bases if flag & 2304: # Supplementary @@ -257,7 +274,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, total_pri += 1 if paired_end_reads: if a.qname in paired_end: # If two primary reads from same pair - er.pe += 2 + er.pe += 1 else: paired_end.add(a.qname) if a.has_tag("NM"): @@ -296,6 +313,8 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, else: er.clip_qual_ratio = 0 + er.a_freq += len(seen) + cdef int within_read_end_position(int event_pos, CigarItem cigar_item): cdef int end @@ -918,7 +937,7 @@ cdef linear_scan_clustering(spanning, bint hp_tag): hp_count += 1 lengths = [s.cigar_item.len for s in hp_spanning] cluster_lengths(clusters, lengths, hp_spanning, eps) - # Merge near-identical haplotypes + # Merge near-identical haplotype if len(clusters) > 1: cl_srt = sorted([ [ np.mean([c.cigar_item.len for c in clst]), clst ] for clst in clusters], key=lambda x: x[0]) merged_haps = [cl_srt[0]] diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 1b21d90..6ce56ad 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -511,8 +511,6 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"]) - # preliminaries = 
post_call.filter_microsatellite_non_diploid(preliminaries) - preliminaries = extra_metrics.sample_level_density(preliminaries, regions) preliminaries = coverage_analyser.normalize_coverage_values(preliminaries) diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx index 47e0f37..32ca3e5 100644 --- a/dysgu/coverage.pyx +++ b/dysgu/coverage.pyx @@ -621,5 +621,8 @@ def get_raw_coverage_information(events, regions, regions_depth, infile, max_cov if r.chrA != r.chrB: r.svlen = 1000000 r.mcov = max_depth + + r.a_freq = r.a_freq / reads_10kb + r.a_freq = round(max(0, min(r.a_freq, 1.0)), 2) new_events.append(r) return new_events diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx index 4ac0bea..4d52d56 100644 --- a/dysgu/io_funcs.pyx +++ b/dysgu/io_funcs.pyx @@ -138,13 +138,13 @@ cpdef list col_names(small_output): if small_output: return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc", - ["GT", "GQ", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase", + ["GT", "GQ", "phase_set", "haplotype", "a_freq", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase", "raw_reads_10kb", "neigh10kb", "plus", "minus", "remap_score", "remap_ed", "bad_clip_count", "fcc", "inner_cn", "outer_cn", "prob"] ] else: return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc", - ["GT", "GQ", "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP", + ["GT", "GQ", "phase_set", "haplotype", "a_freq", "NMpri", "NMsupp", "NMbase", "MAPQpri", "MAPQsupp", "NP", "maxASsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "sqc", "scw", "clip_qual_ratio", "block_edge", "raw_reads_10kb", "mcov", "linked", "neigh", "neigh10kb", "ref_bases", "plus", @@ -157,11 +157,6 
@@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small rep, repsc, lenprec = 0, 0, 1 mean_prob, max_prob = None, None debug = False - # if abs(r['posA'] - 39084726) < 10: - # echo('found', dict(r)) - # echo(len(format_f) > 1) - # echo([(int(v["su"]), v["posA"], v["event_id"], k) for k, v in df_rows.items()]) - # debug = True if len(format_f) > 1: best = sorted([(int(v["su"]), k) for k, v in df_rows.items()], reverse=True)[0][1] probs = [v["prob"] for k, v in df_rows.items()] @@ -273,9 +268,9 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small info_extras += [f"MeanPROB={round(mean_prob, 3)}", f"MaxPROB={round(max_prob, 3)}"] if small_output: - fmt_keys = "GT:GQ:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" + fmt_keys = "GT:GQ:PSET:HP:AF:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" else: - fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB" + fmt_keys = "GT:GQ:PSET:HP:AF:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB" if "variant_seq" in r and isinstance(r["variant_seq"], str): if r['svtype'] == "INS" or r.variant_seq: @@ -318,13 +313,13 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small def get_fmt(r, small_output): if small_output: - v = [r["GT"], r["GQ"], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'], + v = [r["GT"], r["GQ"], r["phase_set"], r["haplotype"], r['a_freq'], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'], r['raw_reads_10kb'], r['neigh10kb'], r["plus"], r["minus"], r["remap_score"], r["remap_ed"], 
r["bad_clip_count"], round(r["fcc"], 3), round(r["inner_cn"], 3), round(r["outer_cn"], 3), r['prob'] ] return v else: - v = [r["GT"], r["GQ"], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'], + v = [r["GT"], r["GQ"], r["phase_set"], r["haplotype"], r['a_freq'], r['NMpri'], r['NMsupp'], r['NMbase'], r['MAPQpri'], r['MAPQsupp'], r['NP'], r['maxASsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], round(r['sqc'], 2), round(r['scw'], 1), round(r['clip_qual_ratio'], 3), r['block_edge'], r['raw_reads_10kb'], round(r['mcov'], 2), int(r['linked']), r['neigh'], r['neigh10kb'], r['ref_bases'], r["plus"], r["minus"], round(r["strand_binom_t"], 4), r['n_gaps'], round(r["n_sa"], 2), @@ -407,6 +402,9 @@ def get_header(contig_names=""): ##ALT= ##FORMAT= ##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT== 30 bp are ignored"> @@ -460,7 +458,6 @@ def get_header(contig_names=""): def to_vcf(df, args, names, outfile, show_names=True, contig_names="", header=None, small_output_f=True, sort_output=True, n_fields=None): - outfile.write(get_header(contig_names) + "\t" + "\t".join(names) + "\n") if show_names: @@ -478,21 +475,19 @@ def to_vcf(df, args, names, outfile, show_names=True, contig_names="", header=N count = 0 recs = [] - if args is not None: - add_kind = args["add_kind"] == "True" - if args["metrics"]: - small_output_f = False - if args["verbosity"] == '0': - df["contigA"] = [''] * len(df) - df["contigB"] = [''] * len(df) - elif args["verbosity"] == '1': - has_alt = [True if isinstance(i, str) and i[0] != '<' else False for i in df['variant_seq']] - df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)] - df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)] - - n_fields = len(col_names(small_output_f)[-1]) - else: + + add_kind = args["add_kind"] == "True" + if args["metrics"]: small_output_f = False + if args["verbosity"] == '0': + df["contigA"] = [''] * len(df) + df["contigB"] = [''] * len(df) + 
elif args["verbosity"] == '1': + has_alt = [True if (isinstance(i, str) and len(i) >= 1 and i[0] != '<') else False for i in df['variant_seq']] + df["contigA"] = ['' if a else c for c, a in zip(df['contigA'], has_alt)] + df["contigB"] = ['' if a else c for c, a in zip(df['contigB'], has_alt)] + + n_fields = len(col_names(small_output_f)[-1]) for idx, r in df.iterrows(): if idx in seen_idx: diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd index 866a729..f27962a 100644 --- a/dysgu/map_set_utils.pxd +++ b/dysgu/map_set_utils.pxd @@ -297,4 +297,5 @@ cdef class EventResult: cdef public bint preciseA, preciseB, linked, modified, remapped cdef public int8_t svlen_precise cdef public object contig, contig2, contig_cigar, contig2_cigar, svtype, join_type, chrA, chrB, exp_seq, sample, type, \ - partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info, qnames, haplotypes + partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info + cdef public object qnames, haplotype, phase_set, a_freq diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index c882661..fec01cc 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -436,16 +436,8 @@ cdef class EventResult: self.n_sa = 0 self.n_gaps = 0 self.compress = 0 + self.a_freq = 0 self.qnames = set([]) def __repr__(self): return str(to_dict(self)) - # return str(self.to_dict()) - - # def __getstate__(self): # for pickling - # return to_dict(self) - # # return self.to_dict() - # - # def __setstate__(self, d): - # for k, v in d.items(): - # self.__setattr__(k, v) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index cdfa231..931389e 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -9,8 +9,8 @@ import glob import gzip import pandas as pd -# import networkx as nx -# from itertools import combinations +import networkx as nx +from itertools import combinations pd.options.mode.chained_assignment = None from 
dysgu.scikitbio._ssw_wrapper import StripedSmithWaterman from dysgu.consensus import compute_rep @@ -324,6 +324,57 @@ def median(arr, start, end): return -1 +def get_ref_base(events, ref_genome, symbolic_sv_size): + for e in events: + if e.posA == 0: + e.posA = 1 # !! + if not e.ref_seq and not e.svtype == "BND": + if symbolic_sv_size == -1 or e.svtype in ("INS", "TRA", "INV"): + if e.posA == 0: + e.posA = 1 + try: + base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() + e.ref_seq = base + except: + pass + else: + bases = "" + if e.svlen < symbolic_sv_size: + start = e.posA + end = e.posB + else: + start = e.posA - 1 + end - e.posA + try: + bases = ref_genome.fetch(e.chrA, start, end).upper() + except: + pass + if e.svtype == "DEL": + e.ref_seq = bases + e.variant_seq = bases[0] if bases else "" + else: + e.variant_seq = bases + e.ref_seq = bases[0] if bases else "" + + # + # + # + # try: + # bases = ref_genome.fetch(e.chrA, e.posA, e.posB).upper() + # + # except: + # pass + # e.ref_seq = bases + # e.variant_seq = bases[0] if len(bases) else "" + # else: + # try: + # bases = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() + # except: + # pass + # e.ref_seq = bases + return events + + def nCk(n,k): f = math.factorial return f(n) // f(k) // f(n-k) @@ -352,38 +403,6 @@ def strand_binom_t(events): return events -def get_ref_base(events, ref_genome, symbolic_sv_size): - for e in events: - if not e.ref_seq and not e.svtype == "BND": - if symbolic_sv_size == -1 or e.svtype == "INS" or e.svtype == "TRA": - if e.posA == 0: - e.posA = 1 - try: - base = ref_genome.fetch(e.chrA, e.posA - 1, e.posA).upper() - e.ref_seq = base - except: - pass - else: - if e.svlen < symbolic_sv_size: - if e.posA == 0: - e.posA = 1 - try: - bases = ref_genome.fetch(e.chrA, e.posA, e.posB).upper() - e.ref_seq = bases - e.variant_seq = bases[0] if len(bases) else "" - except: - pass - else: - if e.posA == 0: - e.posA = 1 - try: - base = ref_genome.fetch(e.chrA, e.posA - 1, 
e.posA).upper() - e.ref_seq = base - except: - pass - return events - - # from svtyper with minor modifications # https://github.com/hall-lab/svtyper/blob/master/svtyper/singlesample.py # efficient combinatorial function to handle extremely large numbers @@ -593,33 +612,102 @@ def is_reciprocal_overlapping(x1,x2, y1,y2): e2.GT = "0/1" -# def phase_haplotypes(events): -# qname_2_eventidx = defaultdict(set) -# for idx, e in enumerate(events): -# if e.qnames and e.GT.replace('/', '|') != '1|1' and e.GT[0] != '.': -# for qname in e.qnames: -# qname_2_eventidx[qname].add(idx) -# -# G = nx.Graph() -# for qname, idxs in qname_2_eventidx.items(): -# for u, v in combinations(idxs, 2): -# if G.has_edge(u, v): -# G[u][v]['weight'] += 1 -# else: -# G.add_edge(u, v, weight=1) -# bad_edges = [] -# for e in G.edges(data=True): -# if e[2]['weight'] < 2: -# bad_edges.append(e) -# G.remove_edges_from(bad_edges) -# echo('G', len(G.edges())) -# for component in nx.connected_components(G): -# echo('component', component) -# gts = [events[idx].GT for idx in component] -# echo('gts', gts) -# echo(list(f'{e.chrA}:{e.posA}({e.svlen})' for e in [events[idx] for idx in component])) -# sub = G.subgraph(component) -# echo(sub.edges(data=True)) +def low_ps_support(r, support_fraction=0.1): + min_support = round(1.5 + support_fraction * r.su) + return min_support + + + +def join_phase_sets(events, ps_id): + # Join phase sets if support is greater than threshold + G = nx.Graph() + for idx, e in enumerate(events): + if not e.phase_set: + e.phase_set = '' + continue + if len(e.phase_set) > 1: + if e.GT not in '1|11/1': # only heterozygous variants + threshold = low_ps_support(e) + passing_ps = [k for k, v in e.phase_set.items() if v >= threshold] + e.phase_set = passing_ps + if len(passing_ps) > 1: + G.add_edges_from(combinations(passing_ps, 2)) + else: + # Choose PS with maximum support + e.phase_set = str(max(e.phase_set, key=e.phase_set.get)) + else: + e.phase_set = 
str(list(e.phase_set.keys())[0]) + + new_phase_set = {} + for sub in nx.connected_components(G): + for n in sub: + new_phase_set[n] = str(ps_id) + ps_id += 1 + + for e in events: + if not e.phase_set or isinstance(e.phase_set, str): + continue + assert isinstance(e.phase_set, list) + new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set]))[0] + if not new_p: + e.phase_set = '' + else: + e.phase_set = str(new_p) + + +def get_hp_format(events): + max_ps = 0 + any_phase_set = False + keys = set([]) + for e in events: + if e.haplotype: + for k in e.haplotype: + if k != 'u': + keys.add(k) + keys = sorted(list(keys)) + for e in events: + if e.haplotype: + all_haps = e.haplotype + phased_haps = {k: v for k, v in e.haplotype.items() if k != 'u'} + n_haps = len(phased_haps) + n_phase_set = len(e.phase_set) if e.phase_set else 0 + if n_phase_set: + ps = max(e.phase_set, key=e.phase_set.get) + if ps > max_ps: + max_ps = ps + any_phase_set = True + else: + e.phase_set = "" + if n_haps >= 2: + if e.GT == "1/1": + e.GT = "1|1" + else: + best_hap = max(phased_haps, key=phased_haps.get) + if best_hap == 1: + e.GT = "1|0" + else: + e.GT = "0|1" + elif n_haps == 1: + if 1 in e.haplotype: + e.GT = "1|0" + else: + e.GT = "0|1" + + hp_string = '' + if n_haps >= 1: + unphased = all_haps['u'] if 'u' in all_haps else 0 + hp_string = '|'.join([str(phased_haps[k]) if k in phased_haps else '0' for k in keys]) + if unphased: + hp_string += f'_{unphased}' + if not hp_string: + hp_string = "." + + e.haplotype = hp_string + else: + e.haplotype = "." 
+ + + return max_ps, any_phase_set def get_gt_metric2(events, mode, add_gt=True): @@ -677,68 +765,17 @@ def get_gt_metric2(events, mode, add_gt=True): fix_inconsistent_gt(events) - # any_haps = False - for e in events: - if e.haplotypes: - # any_haps = True - n_haps = len(e.haplotypes) - if n_haps >= 2: - if e.GT == "1/1": - e.GT = "1|1" - else: - best_hap = max(e.haplotypes, key=e.haplotypes.get) - if best_hap == 1: - e.GT = "1|0" - else: - e.GT = "0|1" - elif n_haps == 1: - if 1 in e.haplotypes: - e.GT = "1|0" - else: - e.GT = "0|1" + max_ps, any_phase_set = get_hp_format(events) - # phase_haplotypes(events) + if any_phase_set: + join_phase_sets(events, max_ps + 1) + else: + for e in events: + e.phase_set = "." return events -# def filter_microsatellite_non_diploid(potential): -# tmp_list = defaultdict(list) -# max_dist = 50 -# half_d = (max_dist * 0.5) -# candidates = [] -# for idx in range(len(potential)): -# ei = potential[idx] -# if (ei.svtype != "DEL" and ei.svtype != "INS") or not ei.variant_seq: -# continue -# rep = compute_rep(ei.variant_seq) -# if rep < 0.4: -# continue -# candidates.append(idx) -# tmp_list[ei.chrA].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) -# if ei.chrA == ei.chrB and ei.svlen > half_d: -# tmp_list[ei.chrA].append((ei.posB - ei.cipos95B - max_dist, ei.posB + ei.cipos95B + max_dist, idx)) -# -# nc2 = {k: iitree(v, add_value=True) for k, v in tmp_list.items()} -# seen = set([]) -# to_drop = set([]) -# for idx in candidates: -# if idx in seen: -# continue -# ei = potential[idx] -# ols = set(nc2[ei.chrA].allOverlappingIntervals(ei.posA, ei.posA + 1)) -# ols2 = set(nc2[ei.chrB].allOverlappingIntervals(ei.posB, ei.posB + 1)) -# ols |= ols2 -# ols.add(idx) -# if len(ols) <= 2: -# continue -# bad = sorted([(i, potential[i]) for i in ols], key=lambda x: x[1].su, reverse=True)[2:] -# for jdx, p in bad: -# to_drop.add(jdx) -# seen |= ols -# return [p for i, p in enumerate(potential) if i not in to_drop] - - 
def compressability(events): for e in events: c1 = [] diff --git a/dysgu/view.py b/dysgu/view.py index 8fdf428..ca04718 100644 --- a/dysgu/view.py +++ b/dysgu/view.py @@ -362,6 +362,9 @@ def vcf_to_df(path): "JIT": ("jitter", float), "LEFT_SVINSSEQ": ("left_ins_seq", str), "RIGHT_SVINSSEQ": ("right_ins_seq", str), + "PSET": ("phase_set", str), + "HP": ("haplotype", str), + "AF": ("a_freq", float), } # df = df[df.posA == 110156314] df.rename(columns={k: v[0] for k, v in col_map.items()}, inplace=True) From 672a90fd1be98273cd41742b3f118adbd2344023 Mon Sep 17 00:00:00 2001 From: kcleal Date: Mon, 24 Feb 2025 14:20:02 +0000 Subject: [PATCH 5/8] Fixed index error --- dysgu/post_call.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index 931389e..c1119b9 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -648,11 +648,11 @@ def join_phase_sets(events, ps_id): if not e.phase_set or isinstance(e.phase_set, str): continue assert isinstance(e.phase_set, list) - new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set]))[0] + new_p = list(set([new_phase_set[n] for n in e.phase_set if n in new_phase_set])) if not new_p: e.phase_set = '' else: - e.phase_set = str(new_p) + e.phase_set = str(new_p[0]) def get_hp_format(events): From f86145b17a2eda41d2e6543503338ae643d8750a Mon Sep 17 00:00:00 2001 From: kcleal Date: Wed, 26 Feb 2025 10:16:11 +0000 Subject: [PATCH 6/8] Changed empty PSET and HP to 0 --- README.rst | 30 ++++++++++++++++++++++-------- dysgu/post_call.py | 6 +++--- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/README.rst b/README.rst index 977a918..c79c873 100644 --- a/README.rst +++ b/README.rst @@ -42,7 +42,7 @@ for calling structural variants using paired-end or long read sequencing data. S ⚙️ Installation --------------- -Dysgu can be installed via pip using Python 3.10 - 3.12 and has been tested on linux and MacOS. 
+Dysgu can be installed via pip using Python >=3.10 and has been tested on linux and MacOS. The list of python packages needed can be found in requirements.txt. To install:: @@ -79,7 +79,8 @@ For help use:: dysgu --help dysgu command --help -To use the python-API see the `documentation `_, or `jupyter notebook `_, +To use the python-API see the `documentation `_, +or `jupyter notebook `_, 🎯 Calling SVs @@ -87,13 +88,16 @@ To use the python-API see the `documentation svs.vcf -This will first run the `fetch` command that will create a temporary bam file plus other analysis files in the directory `temp_dir`. These temporary files are then analysed using the `call` program. +This will first run the `fetch` command that will create a temporary bam file plus other analysis files +in the directory `temp_dir`. These temporary files are then analysed using the `call` program. To make use of multiprocessing, set the "-p" parameter:: dysgu run -p4 reference.fa temp_dir input.bam > svs.vcf @@ -104,14 +108,24 @@ Dysgu also accepts reads from stdin. In this example, the --clean flag will remo Long reads ********** -Dysgy is designed to work with long reads aligned using minimap2 or ngmlr. Use the 'call' pipeline if starting with a bam file, or 'run' if starting with a cram:: +Dysgu is designed to work with long reads aligned using minimap2 or ngmlr. Use the 'call' pipeline if +starting with a bam file, or 'run' if starting with a cram:: dysgu call --mode pacbio-revio reference.fa temp_dir input.bam > svs.vcf dysgu call --mode nanopore-r10 reference.fa temp_dir input.bam > svs.vcf -Presets are available using the `--mode` option for PacBio `pacbio-sequel2 | pacbio-revio`, and ONT `nanopore-r9 | nanopore-r10` -If you are using using reads with higher error rates, or are unsure of the read-accuracy, it is recommended to set '--divergence auto', to infer a more conservative sequence divergence. 
The default is set at 0.02 which can be too stringent for lower accuracy reads and will result in more reads being filtered and lower sensitivity:: +Presets are available using the `--mode` option for PacBio `pacbio-sequel2 | pacbio-revio`, +and ONT `nanopore-r9 | nanopore-r10`. + +Since v1.8, higher calling accuracy can be achieved by adding "haplotags" to your input bam/cram file +(using hiphase / whatshap / longshot etc). Dysgu reads HP and PS tags automatically and produces +phased output calls. + +If you are using reads with higher error rates, +it is recommended to set '--divergence auto', to infer a more conservative sequence divergence. +The default is set at 0.02 which can be too stringent for lower accuracy reads and will result in +more reads being filtered and lower sensitivity:: dysgu call --divergence auto --mode nanopore reference.fa temp_dir input.bam > svs.vcf diff --git a/dysgu/post_call.py b/dysgu/post_call.py index c1119b9..945c7f8 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -700,11 +700,11 @@ def get_hp_format(events): if unphased: hp_string += f'_{unphased}' if not hp_string: - hp_string = "." + hp_string = "0" e.haplotype = hp_string else: - e.haplotype = "." + e.haplotype = "0" return max_ps, any_phase_set @@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True): join_phase_sets(events, max_ps + 1) else: for e in events: - e.phase_set = "." 
+ e.phase_set = "-1" return events From 47d642cd6d9935541f8d7390f20a92e5722c4788 Mon Sep 17 00:00:00 2001 From: kcleal Date: Wed, 26 Feb 2025 10:20:37 +0000 Subject: [PATCH 7/8] PSET and HP default to empty string --- dysgu/post_call.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index 945c7f8..d400d10 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -700,11 +700,11 @@ def get_hp_format(events): if unphased: hp_string += f'_{unphased}' if not hp_string: - hp_string = "0" + hp_string = "" e.haplotype = hp_string else: - e.haplotype = "0" + e.haplotype = "" return max_ps, any_phase_set @@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True): join_phase_sets(events, max_ps + 1) else: for e in events: - e.phase_set = "-1" + e.phase_set = "" return events From de5512f1527ffb55660205d8d2efbad936664ff4 Mon Sep 17 00:00:00 2001 From: kcleal Date: Wed, 26 Feb 2025 10:22:08 +0000 Subject: [PATCH 8/8] PSET and HP default to '-1' and '0' --- dysgu/post_call.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dysgu/post_call.py b/dysgu/post_call.py index d400d10..945c7f8 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -700,11 +700,11 @@ def get_hp_format(events): if unphased: hp_string += f'_{unphased}' if not hp_string: - hp_string = "" + hp_string = "0" e.haplotype = hp_string else: - e.haplotype = "" + e.haplotype = "0" return max_ps, any_phase_set @@ -771,7 +771,7 @@ def get_gt_metric2(events, mode, add_gt=True): join_phase_sets(events, max_ps + 1) else: for e in events: - e.phase_set = "" + e.phase_set = "-1" return events