diff --git a/CommandLines.cpp b/CommandLines.cpp index 5e519f5..5cebb2f 100644 --- a/CommandLines.cpp +++ b/CommandLines.cpp @@ -51,6 +51,7 @@ static ko_longopt_t long_options[] = { { "dbg-het-cnt", ko_no_argument, 337}, { "ul-tip", ko_required_argument, 338}, { "low-het", ko_no_argument, 339}, + { "s-base", ko_required_argument, 340}, { 0, 0, 0 } }; @@ -120,7 +121,7 @@ void Print_H(hifiasm_opt_t* asm_opt) fprintf(stderr, " Purge-dups:\n"); fprintf(stderr, " -l INT purge level. 0: no purging; 1: light; 2/3: aggressive [0 for trio; 3 for unzip]\n"); - fprintf(stderr, " -s FLOAT similarity threshold for duplicate haplotigs [%g for -l1/-l2, %g for -l3]\n", + fprintf(stderr, " -s FLOAT similarity threshold for duplicate haplotigs in read-level [%g for -l1/-l2, %g for -l3]\n", asm_opt->purge_simi_rate_l2, asm_opt->purge_simi_rate_l3); fprintf(stderr, " -O INT min number of overlapped reads for duplicate haplotigs [%d]\n", asm_opt->purge_overlap_len); @@ -134,7 +135,9 @@ void Print_H(hifiasm_opt_t* asm_opt) fprintf(stderr, " --h1 FILEs file names of Hi-C R1 [r1_1.fq,r1_2.fq,...]\n"); fprintf(stderr, " --h2 FILEs file names of Hi-C R2 [r2_1.fq,r2_2.fq,...]\n"); fprintf(stderr, " --seed INT RNG seed [%lu]\n", asm_opt->seed); - + fprintf(stderr, " --s-base FLOAT\n"); + fprintf(stderr, " similarity threshold for homology detection in base-level;\n"); + fprintf(stderr, " -1 to disable [%.3g]; -s for read-level (see )\n", asm_opt->trans_base_rate_sec); fprintf(stderr, " --n-weight INT\n"); fprintf(stderr, " rounds of reweighting Hi-C links [%d]\n", asm_opt->n_weight); @@ -215,7 +218,8 @@ void init_opt(hifiasm_opt_t* asm_opt) asm_opt->purge_level_trio = 0; asm_opt->purge_simi_rate_l2 = 0.75; asm_opt->purge_simi_rate_l3 = 0.55; - asm_opt->trans_base_rate = 0.8; + asm_opt->trans_base_rate = 0.93; + asm_opt->trans_base_rate_sec = 0.5; asm_opt->purge_overlap_len = 1; ///asm_opt->purge_overlap_len_hic = 50; asm_opt->recover_atg_cov_min = -1024; @@ -252,6 +256,10 @@ void init_opt(hifiasm_opt_t* asm_opt) asm_opt->ul_ec_round = 3; asm_opt->is_dbg_het_cnt = 0; asm_opt->is_low_het_ul = 0; + asm_opt->is_base_trans = 1; + asm_opt->is_read_trans = 1; + asm_opt->is_topo_trans = 1; + asm_opt->is_bub_trans = 1; } void destory_enzyme(enzyme* f) @@ -771,7 +779,10 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt) else if (c == 337) asm_opt->is_dbg_het_cnt = 1; else if (c == 338) asm_opt->max_short_ul_tip = atol(opt.arg); else if (c == 339) asm_opt->is_low_het_ul = 1; - else if (c == 'l') + else if (c == 340) { + asm_opt->trans_base_rate_sec = atof(opt.arg); + if(asm_opt->trans_base_rate_sec < 0) asm_opt->is_base_trans = 0; + } else if (c == 'l') { ///0: disable purge_dup; 1: purge containment; 2: purge overlap asm_opt->purge_level_primary = asm_opt->purge_level_trio = atoi(opt.arg); } @@ -788,10 +799,9 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt) return 1; } } - + if(asm_opt->purge_level_primary > 2) asm_opt->purge_simi_thres = asm_opt->purge_simi_rate_l3; else asm_opt->purge_simi_thres = asm_opt->purge_simi_rate_l2; - if (argc == opt.ind) { diff --git a/CommandLines.h b/CommandLines.h index 013c611..a61bff9 100644 --- a/CommandLines.h +++ b/CommandLines.h @@ -4,7 +4,7 @@ #include #include -#define HA_VERSION "0.18.2-r467" +#define HA_VERSION "0.18.4-r496" #define VERBOSE 0 @@ -101,6 +101,7 @@ typedef struct { float purge_simi_rate_l3; float purge_simi_thres; float trans_base_rate; + float trans_base_rate_sec; ///float purge_simi_rate_hic; @@ -130,6 +131,10 @@ typedef struct { int32_t ul_ec_round; uint8_t is_dbg_het_cnt; uint8_t is_low_het_ul; + uint8_t is_base_trans; + uint8_t is_read_trans; + uint8_t is_topo_trans; + uint8_t is_bub_trans; } hifiasm_opt_t; extern hifiasm_opt_t asm_opt; diff --git a/Correct.cpp b/Correct.cpp index d911358..9e07c44 100644 --- a/Correct.cpp +++ b/Correct.cpp @@ -3485,6 +3485,24 @@ inline void recalcate_window_advance(overlap_region_alloc* overlap_list, All_rea ///note!!! need notification end_site = Reserve_Banded_BPM_PATH(y_string, Window_Len, x_string, x_len, threshold, &error, &real_y_start, &(dumy->path_length), dumy->matrix_bit, dumy->path, p->error, p->y_end - y_start); + // if(!(error != (unsigned int)-1)) { + // fprintf(stderr, "[M::%s::]\tqid::%u\tqlen::%lu\tq::[%d,\t%d)\ttid::%u\ttlen::%lu\tt::[%d,\t%d)\te_beg::%d\te_end::%d\terr::%d\n", __func__, + // overlap_list->list[j].x_id, Get_READ_LENGTH((*rref), overlap_list->list[j].x_id), + // p->x_start, p->x_end+1, + // overlap_list->list[j].y_id, Get_READ_LENGTH((*rref), overlap_list->list[j].y_id), + // p->y_start, p->y_end+1, + // p->extra_begin, p->extra_end, p->error); + // fprintf(stderr, "[M::%s::]\tqcal_len::%lld\ttcal_len::%lld\tthres::%d\n", __func__, + // x_len, Window_Len, threshold); + // fprintf(stderr, "qid::%u\nqname::%.*s\n\t%.*s\n", overlap_list->list[j].x_id, + // (int32_t)Get_NAME_LENGTH((*rref), overlap_list->list[j].x_id), + // Get_NAME((*rref), overlap_list->list[j].x_id), (int32_t)x_len, x_string); + + // fprintf(stderr, "tid::%u\ntname::%.*s\n\t%.*s\n", overlap_list->list[j].y_id, + // (int32_t)Get_NAME_LENGTH((*rref), overlap_list->list[j].y_id), + // Get_NAME((*rref), overlap_list->list[j].y_id), (int32_t)Window_Len, y_string); + + // } assert(error != (unsigned int)-1); { @@ -14918,6 +14936,13 @@ int64_t ql, int64_t tl, double e_rate, int64_t h_khit, int64_t mode, int64_t rid ws = qs; if(ws < z->x_pos_s) ws = z->x_pos_s; we = qe-1; if(we > z->x_pos_e) we = z->x_pos_e; wsk = get_win_id_by_s(z, ((ws/wl)*wl), wl, NULL); + // if(rid == 7) { + // fprintf(stderr, "[M::%s::]\tutg%.6u%c\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\tq::[%ld,%ld)\tw::[%ld,%ld]\twn::%ld\twsk::%ld\n", __func__, + // z->x_id+1, "lc"[uref->ug->u.a[z->x_id].circ], + // z->x_pos_s, z->x_pos_e+1, "+-"[z->y_pos_strand], + // z->y_id+1, "lc"[uref->ug->u.a[z->y_id].circ], + // z->y_pos_s, z->y_pos_e+1, qs, qe, ws, we, wn, wsk); + // } // assert((ws>=z->w_list.a[wsk].x_start) && (ws<=z->w_list.a[wsk].x_end)); // wek = get_win_id_by_e(z, ((we/wl)*wl), wl, NULL); // assert((we>=z->w_list.a[wek].x_start) && (we<=z->w_list.a[wek].x_end)); @@ -18854,6 +18879,61 @@ kv_rtrace_t *trace, uint64_t ql, uint64_t rid, uint64_t oid, uint64_t khit, over } } + +int64_t ul_raw_aln(overlap_region *z, Candidates_list *cl, const ul_idx_t *udb, char* qstr, UC_Read *tu, +bit_extz_t *exz, double e_rate, int64_t w_l, uint64_t ql, uint64_t rid, uint64_t khit, overlap_region *aux_o) +{ + return_t_chain(z, cl); + int64_t ch_idx = z->shared_seed, ch_n; + int64_t i, tl, id = z->y_id, m; ul_ov_t ov; + k_mer_hit *ch_a = cl->list + ch_idx; + tl = udb->ug->u.a[id].len; + for (i = ch_idx; i < cl->length && cl->list[i].readID == cl->list[ch_idx].readID; i++); ch_n = i-ch_idx; + + // on = fusion_chain_ovlp(z, ch_a, ch_n, ov, on, wl, ql, tl); + aux_o->w_list.n = aux_o->w_list.c.n = 0; + aux_o->y_id = z->y_id; aux_o->y_pos_strand = z->y_pos_strand; + aux_o->x_pos_s = z->x_pos_s; aux_o->x_pos_e = z->x_pos_e; + aux_o->y_pos_s = z->y_pos_s; aux_o->y_pos_e = z->y_pos_e; + + memset(&ov, 0, sizeof(ov)); ov.sec = 3; + ov.ts = ov.te = (uint32_t)-1; + ov.qs = 0; ov.qn = (uint32_t)-1; //extension to left + ov.qe = ql; ov.tn = ch_n; //extension to right + + ovlp_base_direct(z, ch_a, ch_n, &ov, w_l, udb, NULL, NULL, qstr, tu, exz, aux_o, e_rate, ql, tl, rid); + + int64_t aux_n = aux_o->w_list.n; + for (i = 0; i < aux_n; i++) { + if(!(is_ualn_win(aux_o->w_list.a[i]))) continue; + //will overwrite ch_a; does not matter + rechain_aln(z, cl, aux_o, i, w_l, udb, NULL, NULL, qstr, tu, exz, e_rate, ql, tl, khit, rid); + } + if(((int64_t)aux_o->w_list.n) > aux_n) { + for (i = m = 0; i < ((int64_t)aux_o->w_list.n); i++) { + if((i < aux_n) && (is_ualn_win(aux_o->w_list.a[i]))) continue; + aux_o->w_list.a[m++] = aux_o->w_list.a[i]; + } + aux_o->w_list.n = m; + radix_sort_window_list_xs_srt(aux_o->w_list.a, aux_o->w_list.a+aux_o->w_list.n); + } + + ///update z by aux_o + update_overlap_region(z, aux_o, ql, tl); + + int64_t zwn = z->w_list.n, zerr = 0, zlen = z->x_pos_e+1-z->x_pos_s; + for (i = 0; i < zwn; i++) { + if(is_ualn_win(z->w_list.a[i])) zerr += z->w_list.a[i].x_end+1-z->w_list.a[i].x_start; + else zerr += z->w_list.a[i].error; + } + z->non_homopolymer_errors = zerr; + + if(zerr >= zlen) return 0; + if(zerr <= 0 && zlen > 0) return 1; + if(zerr > (zlen*e_rate)) return 0; + return 1; +} + void dedup_ul_ov_t(kv_ul_ov_t *in) { ul_ov_t *a = in->a; int64_t k, l, m, z, r, a_n = in->n; uint64_t qo, to; double rr = 0.95; @@ -18998,4 +19078,95 @@ void ul_raw_lalign_adv(overlap_region_alloc* ol, Candidates_list *cl, const ul_i // copy_asg_arr(hap->snp_srt, iidx); copy_asg_arr(v_idx->a, buf); copy_asg_arr((*stb), buf1); // } } -**/ \ No newline at end of file +**/ + +uint64_t gen_nkhits(Candidates_list *cl, overlap_region *z) +{ + uint64_t pid, kn; int64_t k; + pid = cl->list[z->shared_seed].readID;kn = 0; + for (k = z->shared_seed; k < cl->length && cl->list[k].readID == pid; k++) kn++; + return kn; +} + +void ug_lalign(overlap_region_alloc* ol, Candidates_list *cl, const ul_idx_t *uref, const ug_opt_t *uopt, + char *qstr, uint64_t ql, UC_Read* qu, UC_Read* tu, Correct_dumy* dumy, bit_extz_t *exz, + overlap_region *aux_o, double e_rate, int64_t wl, int64_t sid, uint64_t khit, + uint64_t chain_cut, void *km) +{ + uint64_t i, bs, k, ovl, pid, kl, kn; Window_Pool w; double err; + overlap_region t; overlap_region *z; + ol->mapped_overlaps_length = 0; + if(ol->length <= 0) return; + + ///base alignment + clear_Correct_dumy(dumy, ol, km); err = e_rate; + init_Window_Pool(&w, ql, wl, (int)(1.0/err)); + bs = (w.window_length)+(THRESHOLD_MAX_SIZE<<1)+1; + resize_UC_Read(tu, bs<<1); + + if(!aux_o) { + resize_UC_Read(qu, ql); qu->length = ql; memcpy(qu->seq, qstr, ql); + for (i = 0; i < ol->length; i++) { + z = &(ol->list[i]); ovl = z->x_pos_e+1-z->x_pos_s; + z->shared_seed = z->non_homopolymer_errors;///for index + pid = cl->list[z->shared_seed].readID; kl = cl->length; kn = 0; + for (k = z->shared_seed; k < kl && cl->list[k].readID == pid && kn < chain_cut; k++) kn++; + if(kn < chain_cut) continue; + + if(!align_ul_ed_post_extz(z, uref, NULL, qu->seq, tu->seq, exz, err, w.window_length, -1, 0, km)) { + continue; + } + if(uref && simi_pass(ovl, z->align_length, uref?1:0, -1, NULL)) { + z->is_match = 3; ol->mapped_overlaps_length += z->align_length; + } + } + + // if(uref && ol->mapped_overlaps_length > 0) { + // set_herror_win(ol, dumy, v_idx, err, ql, w.window_length); + // } + + double e_max = err*1.5, rr; int64_t re; + for (i = k = 0; i < ol->length; i++) { + z = &(ol->list[i]); ovl = z->x_pos_e + 1 - z->x_pos_s; + rr = gen_extend_err_exz(z, uref, NULL, NULL, qu->seq, tu->seq, exz, NULL/**v_idx?v_idx->a.a:NULL**/, w.window_length, -1, err, (e_max+0.000001), &re); + z->is_match = 0;///must be here; + + // if(sid == 7 && z->y_id == 135) { + // fprintf(stderr, "===utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\talign_length::%u\terr::%f\trr::%f\twn::%u\n", + // z->x_id+1, "lc"[uref->ug->u.a[z->x_id].circ], uref->ug->u.a[z->x_id].len, + // z->x_pos_s, z->x_pos_e+1, "+-"[z->y_pos_strand], + // z->y_id+1, "lc"[uref->ug->u.a[z->y_id].circ], uref->ug->u.a[z->y_id].len, + // z->y_pos_s, z->y_pos_e+1, z->align_length, err, rr, (uint32_t)z->w_list.n/**gen_nkhits(cl, z)**/); + // } + if (rr <= err) { + if(k != i) { + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + ol->list[k].is_match = 1; ol->list[k].non_homopolymer_errors = re; + k++; + } + } + + ol->length = k; + if(ol->length <= 0) return; + } else { + if(ol->length <= 1) return; + for (i = k = 0; i < ol->length; i++) { + z = &(ol->list[i]); + if(!ul_raw_aln(z, cl, uref, qu->seq, tu, exz, err, wl, ql, sid, khit, aux_o)) { + z->is_match = 0; + continue; + } + if(k != i) { + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + ol->list[k].is_match = 1; + k++; + } + ol->length = k; + } +} \ No newline at end of file diff --git a/Correct.h b/Correct.h index 675263a..c82755d 100644 --- a/Correct.h +++ b/Correct.h @@ -1160,6 +1160,11 @@ void lchain_align(overlap_region_alloc* overlap_list, const ul_idx_t *uref, kvec_t_u64_warp* v_idx, window_list_alloc* win_ciagr_buf, int force_repeat, int is_consensus, int* fully_cov, int* abnormal, double max_ov_diff_ec, long long winLen, void *km); + +void ug_lalign(overlap_region_alloc* ol, Candidates_list *cl, const ul_idx_t *uref, const ug_opt_t *uopt, + char *qstr, uint64_t ql, UC_Read* qu, UC_Read* tu, Correct_dumy* dumy, bit_extz_t *exz, + overlap_region *aux_o, double e_rate, int64_t wl, int64_t sid, uint64_t khit, uint64_t chain_cut, + void *km); /*** type: 0. match diff --git a/Hash_Table.cpp b/Hash_Table.cpp index 051ee8c..796afad 100644 --- a/Hash_Table.cpp +++ b/Hash_Table.cpp @@ -1823,8 +1823,10 @@ uint64_t lchain_qdp_mcopy(Candidates_list *cl, int64_t a_idx, int64_t a_n, int64 Chain_Data* dp, overlap_region_alloc* res, int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, uint32_t xid, int64_t xl, int64_t yl, int64_t quick_check, uint32_t apend_be, - int64_t gen_cigar) + int64_t gen_cigar, int64_t enable_mcopy, double mcopy_rate, int64_t mcopy_khit_cutoff, + int64_t khit_n) { + if(a_n <= 0) return 0; int64_t *p, *t, max_f, n_skip, st, max_j, end_j, sc, msc, msc_i, bw, max_ii, ovl, movl, plus = 0, min_sc, ch_n; int32_t *f, max, tmp, *ii; int64_t i, k, j, cL = 0; k_mer_hit* a; k_mer_hit* des; k_mer_hit *swap; overlap_region *z; resize_Chain_Data(dp, a_n, NULL); @@ -1884,86 +1886,102 @@ uint64_t lchain_qdp_mcopy(Candidates_list *cl, int64_t a_idx, int64_t a_n, int64 } } if(f[i] < plus) plus = f[i]; + ii[i] = 0;///for mcopy, not here // if(a_n && a[0].readID == 0) { // fprintf(stderr, "i::%ld[M::%s::utg%.6dl::%c] x::%u, y::%u, st::%ld, max_ii::%ld, f[i]::%d, p[i]::%ld, msc_i::%ld, msc::%ld, movl::%ld\n", // i, __func__, (int32_t)a[i].readID+1, "+-"[a[i].strand], // a[i].self_offset, a[i].offset, st, max_ii, f[i], p[i], msc_i, msc, movl); // } } - if((movl < xl) && (movl < yl)) { - msc -= plus; min_sc = msc*0.2; - for (i = ch_n = 0; i < a_n; ++i) {///make all f[] positive - f[i] -= plus; if(i >= ch_n) t[i] = 0; - if(f[i] >= min_sc) { - t[ch_n] = ((uint64_t)f[i])<<32; t[ch_n] += (i<<1); ch_n++; - } - } - int64_t n_v, n_v0, ni, n_u, n_u0 = res->length; - radix_sort_hc64i(t, t + ch_n); - for (k = ch_n-1, n_v = n_u = 0; k >= 0; --k) { - n_v0 = n_v; - for (i = ((uint32_t)t[k])>>1; i >= 0 && (t[i]&1) == 0; ) { - ii[n_v++] = i; t[i] |= 1; i = p[i]; - } - // if(a_n && a[0].readID == 0) { - // fprintf(stderr, "init_sc::%ld[M::%s::k->%ld] n_v0::%ld, n_v::%ld\n", t[k]>>32, __func__, k, n_v0, n_v); - // } - if(n_v0 == n_v) continue; - sc = (i<0?(t[k]>>32):((t[k]>>32)-f[i])); - if(sc >= min_sc) { - // if(a_n && a[0].readID == 0) { - // fprintf(stderr, "sc::%ld[M::%s::] n_v0::%ld, n_v::%ld, k::%ld, ch_n::%ld, msc::%ld, min_sc::%ld\n", - // sc, __func__, n_v0, n_v, k, ch_n, msc, min_sc); - // } - kv_pushp_ol(overlap_region, (*res), &z); - push_ovlp_chain_qgen(z, xid, xl, yl, sc+plus, &(a[ii[n_v-1]]), &(a[ii[n_v0]])); - z->align_length = n_v-n_v0; z->x_id = n_v0; - n_u++; - } else { - n_v = n_v0; + for (i = msc_i, cL = 0; i >= 0; i = p[i]) { ii[i] = 1; t[cL++] = i;}///label the best chain + if((movl < xl) && enable_mcopy/**(movl < yl)**/) { + if(cL >= mcopy_khit_cutoff) {///if there are too few k-mers, disable mcopy + msc -= plus; min_sc = msc*mcopy_rate/**0.2**/; ii[msc_i] = 0; + for (i = ch_n = 0; i < a_n; ++i) {///make all f[] positive + f[i] -= plus; if(i >= ch_n) t[i] = 0; + if((!(ii[i])) && (f[i] >= min_sc)) { + t[ch_n] = ((uint64_t)f[i])<<32; t[ch_n] += (i<<1); ch_n++; + } } - } - - ks_introsort_or_sss(n_u, res->list + n_u0); - res->length = n_u0 + filter_non_ovlp_xchains(res->list + n_u0, n_u, &n_v); - n_u = res->length; + if(ch_n > 1) { + int64_t n_v, n_v0, ni, n_u, n_u0 = res->length; + radix_sort_hc64i(t, t + ch_n); + for (k = ch_n-1, n_v = n_u = 0; k >= 0; --k) { + n_v0 = n_v; + for (i = ((uint32_t)t[k])>>1; i >= 0 && (t[i]&1) == 0; ) { + ii[n_v++] = i; t[i] |= 1; i = p[i]; + } + if(n_v0 == n_v) continue; + sc = (i<0?(t[k]>>32):((t[k]>>32)-f[i])); + if(sc >= min_sc) { + kv_pushp_ol(overlap_region, (*res), &z); + push_ovlp_chain_qgen(z, xid, xl, yl, sc+plus, &(a[ii[n_v-1]]), &(a[ii[n_v0]])); + ///mcopy_khit_cutoff <= 1: disable the mcopy_khit_cutoff filtering, for the realignment + if((mcopy_khit_cutoff <= 1) || ((z->x_pos_e+1-z->x_pos_s) <= (movl<<2))) { + z->align_length = n_v-n_v0; z->x_id = n_v0; + n_u++; + } else {///non-best is too long + res->length--; n_v = n_v0; + } + } else { + n_v = n_v0; + } + } - kv_resize_cl(k_mer_hit, (*cl), (n_v+cl->length)); - a = cl->list + a_idx; des = cl->list + des_idx; swap = cl->list + cl->length; - for (k = n_u0, i = n_v0 = n_v = 0; k < n_u; k++) { - z = &(res->list[k]); - z->non_homopolymer_errors = des_idx + i; - n_v0 = z->x_id; ni = z->align_length; - // if(n_u > 1) { - // fprintf(stderr, "\nk::%ld[M::%s::utg%.6dl::%c] ni::%ld\n", - // i, __func__, (int32_t)des[i].readID+1, "+-"[des[i].strand], ni); - // } - for (j = 0; j < ni; j++, i++) { - ///k0 + (ni - j - 1) - swap[i] = a[ii[n_v0 + (ni- j - 1)]]; - swap[i].readID = k; - // if(a_n && a[0].readID == 0) { - // fprintf(stderr, "i::%ld[M::%s::utg%.6dl::%c] x::%u, y::%u, ni::%ld\n", - // i, __func__, (int32_t)swap[i].readID+1, "+-"[swap[i].strand], - // swap[i].self_offset, swap[i].offset, ni); - // } + if(n_u > 1) ks_introsort_or_sss(n_u, res->list + n_u0); + res->length = n_u0 + filter_non_ovlp_xchains(res->list + n_u0, n_u, &n_v); + n_u = res->length; + if(n_u > n_u0 + 1) { + kv_resize_cl(k_mer_hit, (*cl), (n_v+cl->length)); + a = cl->list + a_idx; des = cl->list + des_idx; swap = cl->list + cl->length; + for (k = n_u0, i = n_v0 = n_v = 0; k < n_u; k++) { + z = &(res->list[k]); + z->non_homopolymer_errors = des_idx + i; + n_v0 = z->x_id; ni = z->align_length; + for (j = 0; j < ni; j++, i++) { + ///k0 + (ni - j - 1) + swap[i] = a[ii[n_v0 + (ni- j - 1)]]; + swap[i].readID = k; + } + z->x_id = xid; + if(gen_cigar) gen_fake_cigar(&(z->f_cigar), z, apend_be, swap+i-ni, ni); + if(!khit_n) z->align_length = 0; + } + memcpy(des, swap, i*sizeof((*swap))); //assert(i == ch_n); + + // fprintf(stderr, "[M::%s::msc->%ld] msc_k_hits::%u, cL::%ld, min_sc::%ld, best_sc::%ld, n_u0_sc::%d, mcopy_rate::%f, # chains::%ld\n", + // __func__, msc, res->list[n_u0].align_length, cL, min_sc, msc+plus, res->list[n_u0].shared_seed, + // mcopy_rate, n_u-n_u0); + } else if(n_u == n_u0 + 1) { + z = &(res->list[n_u0]); k = n_u0; i = 0; + z->non_homopolymer_errors = des_idx + i; + n_v0 = z->x_id; ni = z->align_length; + for (j = 0; j < ni; j++, i++) { + ///k0 + (ni - j - 1) + des[i] = a[ii[n_v0 + (ni- j - 1)]]; + des[i].readID = k; + } + z->x_id = xid; + if(gen_cigar) gen_fake_cigar(&(z->f_cigar), z, apend_be, des+i-ni, ni); + if(!khit_n) z->align_length = 0; + } + return i; + } else { + msc += plus; i = msc_i; cL = 0; + while (i >= 0) {t[cL++] = i; i = p[i];} } - z->x_id = xid; - if(gen_cigar) gen_fake_cigar(&(z->f_cigar), z, apend_be, swap+i-ni, ni); - z->align_length = 0; } - memcpy(des, swap, i*sizeof((*swap))); //assert(i == ch_n); - return i; } ///a[] has been sorted by self_offset - i = msc_i; cL = 0; - while (i >= 0) {t[cL++] = i; i = p[i];} + // i = msc_i; cL = 0; + // while (i >= 0) {t[cL++] = i; i = p[i];} kv_pushp_ol(overlap_region, (*res), &z); push_ovlp_chain_qgen(z, xid, xl, yl, msc, &(a[t[cL-1]]), &(a[t[0]])); for (i = 0; i < cL; i++) {des[i] = a[t[cL-i-1]]; des[i].readID = res->length-1;} z->non_homopolymer_errors = des_idx; if(gen_cigar) gen_fake_cigar(&(z->f_cigar), z, apend_be, des, cL); + if(khit_n) z->align_length = cL; return cL; } diff --git a/Hash_Table.h b/Hash_Table.h index d0aebb3..e6f05c9 100644 --- a/Hash_Table.h +++ b/Hash_Table.h @@ -233,6 +233,7 @@ uint64_t lchain_qdp_mcopy(Candidates_list *cl, int64_t a_idx, int64_t a_n, int64 Chain_Data* dp, overlap_region_alloc* res, int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, uint32_t xid, int64_t xl, int64_t yl, int64_t quick_check, uint32_t apend_be, - int64_t gen_cigar); + int64_t gen_cigar, int64_t enable_mcopy, double mcopy_rate, int64_t mcopy_khit_cutoff, + int64_t khit_n); #endif diff --git a/Overlaps.cpp b/Overlaps.cpp index 95c7eb6..27126be 100644 --- a/Overlaps.cpp +++ b/Overlaps.cpp @@ -55,6 +55,9 @@ KRADIX_SORT_INIT(u_trans_qs, u_trans_t, u_trans_qs_key, member_size(u_trans_t, q #define u_trans_ts_key(a) ((a).ts) KRADIX_SORT_INIT(u_trans_ts, u_trans_t, u_trans_ts_key, member_size(u_trans_t, ts)) +#define ha_mzl_t_key(p) ((p).x) +KRADIX_SORT_INIT(ha_mzl_t_srt1, ha_mzl_t, ha_mzl_t_key, member_size(ha_mzl_t, x)) + #define UL_COV_THRES 2 KSORT_INIT_GENERIC(uint32_t) @@ -70,6 +73,22 @@ typedef struct { tip_t *b; }kv_tip_t; +typedef struct { + asg64_v z; + asg64_v idx; +}u_trans_cluster; + +typedef struct { // global data structure for kt_pipeline() + ma_ug_t *ug; + asg_t *rg; + kv_u_trans_t *res; + kv_u_trans_t *ta; + uint64_t n_thread, ov_cutoff; + double small_ov_rate, sec_rate; + asg64_v *srt; + u_trans_cluster *cu; +} u_trans_clean_t; + ///this value has been updated at the first line of build_string_graph_without_clean long long min_thres; @@ -13752,188 +13771,759 @@ void write_dbug(ma_ug_t* ug, FILE* fp) } } -void output_contig_graph_alternative(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, -ma_hit_t_alloc* sources, R_to_U* ruIndex, int max_hang, int min_ovlp); -void clean_u_trans_t_idx(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g); -void output_hic_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, -ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, -long long tipsLen, float tip_drop_ratio, long long stops_threshold, -R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, -long long gap_fuzz, bub_label_t* b_mask_t) -{ - hic_clean(sg); - ug_opt_t opt; memset(&opt, 0, sizeof(opt)); - kvec_pe_hit *rhits = NULL; - ma_ug_t *ug_fa = NULL, *ug_mo = NULL; - opt.coverage_cut = coverage_cut; - opt.sources = sources; - opt.reverse_sources = reverse_sources; - opt.tipsLen = (asm_opt.max_short_tip*2); - opt.tip_drop_ratio = 0.15; - opt.stops_threshold = 3; - opt.ruIndex = ruIndex; - opt.chimeric_rate = 0.05; - opt.drop_ratio = 0.9; - opt.max_hang = max_hang; - opt.min_ovlp = min_ovlp; - opt.is_bench = 0; - opt.b_mask_t = b_mask_t; - opt.gap_fuzz = gap_fuzz; +void filter_u_trans(kv_u_trans_t *ta, uint8_t keep_bub, uint8_t keep_topo, uint8_t keep_read, uint8_t keep_base) +{ + if(keep_bub && keep_topo && keep_read && keep_base) return; + uint32_t i, m; + for (i = m = 0; i < ta->n; i++) { + if((!keep_bub) && (ta->a[i].f == RC_0)) continue; + if((!keep_topo) && (ta->a[i].f == RC_1)) continue; + if((!keep_read) && (ta->a[i].f == RC_2)) continue; + if((!keep_base) && (ta->a[i].f == RC_3)) continue; + ta->a[m++] = ta->a[i]; + } + ta->n = m; +} +uint32_t trans_ovlp_connect1(u_trans_t *p, ma_ug_t *ug) +{ + uint32_t v = p->qn<<1, w = (p->tn<<1) + ((uint32_t)p->rev), i, dg = (uint32_t)-1, da, dif, mm; + asg_arc_t *av = asg_arc_a(ug->g, v); uint32_t nv = asg_arc_n(ug->g, v); + for (i = 0; i < nv; i++) { + if(av[i].del || av[i].v != w) continue; + dg = av[i].ol; + break; + } + // if((v>>1) == 43 && (w>>1) == 45) { + // fprintf(stderr, "+[M::%s::] utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tf::%u\n", __func__, + // p->qn+1, "lc"[s->ug->u.a[p->qn].circ], s->ug->u.a[p->qn].len, p->qs, p->qe, + // "+-"[p->rev], p->tn+1, "lc"[s->ug->u.a[p->tn].circ], s->ug->u.a[p->tn].len, p->ts, p->te, p->f); + // } - kvec_asg_arc_t_warp new_rtg_edges, d_edges; - kv_init(new_rtg_edges.a); kv_init(d_edges.a); - ma_ug_t *ug = NULL; - ug = ma_ug_gen_primary(sg, PRIMARY_LABLE); + if(i < nv) { + // if(i >= nv) return 1; + da = (p->qe - p->qs); + dif = (dg>da? dg-da:da-dg); mm = MAX(dg, da); mm *= 0.06; if(mm < 8) mm = 8; + if(dif <= mm) return 0; - new_rtg_edges.a.n = 0; - ma_ug_seq(ug, sg, coverage_cut, sources, &new_rtg_edges, max_hang, min_ovlp, &d_edges, 1);///polish - + da = (p->te - p->ts); + dif = (dg>da? dg-da:da-dg); mm = MAX(dg, da); mm *= 0.06; if(mm < 8) mm = 8; + if(dif <= mm) return 0; + } - hap_cov_t *cov = NULL; - trans_chain* t_ch = NULL; - if((asm_opt.flag & HA_F_VERBOSE_GFA)) t_ch = load_hc_trans(output_file_name); - - if(!t_ch) - { - new_rtg_edges.a.n = 0; - asg_t *copy_sg = copy_read_graph(sg); - ma_ug_t *copy_ug = copy_untig_graph(ug); - ///asm_opt.purge_overlap_len = asm_opt.purge_overlap_len_hic; - ///asm_opt.purge_simi_thres = asm_opt.purge_simi_rate_hic; - adjust_utg_by_primary(©_ug, copy_sg, TRIO_THRES, sources, reverse_sources, coverage_cut, - tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio, - max_hang, min_ovlp, &new_rtg_edges, &cov, b_mask_t, 1, 0); - print_utg(copy_ug, copy_sg, coverage_cut, output_file_name, sources, ruIndex, max_hang, - min_ovlp, &new_rtg_edges); + v ^= 1; w ^= 1; dg = (uint32_t)-1; + av = asg_arc_a(ug->g, v); nv = asg_arc_n(ug->g, v); + for (i = 0; i < nv; i++) { + if(av[i].del || av[i].v != w) continue; + dg = av[i].ol; + break; + } + if(i < nv) { + // if(i >= nv) return 1; + da = (p->qe - p->qs); + dif = (dg>da? dg-da:da-dg); mm = MAX(dg, da); mm *= 0.06; if(mm < 8) mm = 8; + if(dif <= mm) return 0; - if(asm_opt.is_alt) - { - output_contig_graph_alternative(copy_sg, coverage_cut, output_file_name, sources, ruIndex, max_hang, - min_ovlp); + da = (p->te - p->ts); + dif = (dg>da? dg-da:da-dg); mm = MAX(dg, da); mm *= 0.06; if(mm < 8) mm = 8; + if(dif <= mm) return 0; + } + + return 1; +} + + +uint32_t ovlp_rocc(u_trans_t *p, ma_ug_t *ug, asg_t *rg, double cov_rate, uint32_t cov_cutoff) +{ + ma_utg_t *u = NULL; uint32_t s, e, rs, re, occ, k, l; + u = &(ug->u.a[p->qn]); s = p->qs; e = p->qe; + if((e-s) >= (u->len*cov_rate)) return 1; + for (k = l = occ = 0; k < u->n && occ < cov_cutoff; k++) { + rs = l; re = l + rg->seq[u->a[k]>>33].len; + if(rs >= e) break; + if(s <= rs && e >= re) occ++; + l += (uint32_t)u->a[k]; + } + if(occ >= cov_cutoff) return 1; + + u = &(ug->u.a[p->tn]); s = p->ts; e = p->te; + if((e-s) >= (u->len*cov_rate)) return 1; + for (k = l = occ = 0; k < u->n && occ < cov_cutoff; k++) { + rs = l; re = l + rg->seq[u->a[k]>>33].len; + if(rs >= e) break; + if(s <= rs && e >= re) occ++; + l += (uint32_t)u->a[k]; + } + if(occ >= cov_cutoff) return 1; + return 0; +} + +static void worker_for_trans_clean(void *data, long i, int tid) // callback for kt_for() +{ + u_trans_clean_t *s = (u_trans_clean_t*)data; + kv_u_trans_t *ta = s->ta; + kv_u_trans_t *res = &(s->res[tid]); + u_trans_t *a = NULL, *r_a = NULL, *mz, *cz; + uint32_t n, r_n, id = i, k, l, z; + + a = u_trans_a(*ta, id); n = u_trans_n(*ta, id); + for (k = 1, l = 0; k <= n; k++) { + if(k == n || a[l].tn != a[k].tn) { + if(k > l) { + get_u_trans_spec(ta, a[l].tn, a[l].qn, &r_a, &r_n); + if(r_n > 0 && id > a[l].tn) { + l = k; continue; + } + mz = cz = NULL; + for (z = l; z < k; z++) { + cz = &(a[z]); + // fprintf(stderr, ">[M::%s::] qn::%u, tn::%u, cz->nw::%f, cz->f::%u\n", + // __func__, a[l].qn, a[l].tn, cz->nw, cz->f); + if(mz) { + if((mz->f == RC_0) && (cz->f != RC_0)) continue; + if((mz->f == RC_1) && ((cz->f == RC_2) || (cz->f == RC_3))) continue; + } + if((!mz) || ((cz->f < mz->f) && ((cz->f == RC_0) || (cz->f == RC_1))) || + (mz->nw < cz->nw) || ((mz->nw == cz->nw) && ((mz->qe - mz->qs) < (cz->qe - cz->qs)))) { + // fprintf(stderr, "+[M::%s::] qn::%u, tn::%u, mz->nw::%f, mz->f::%u, cz->nw::%f, cz->f::%u\n", + // __func__, a[l].qn, a[l].tn, mz?mz->nw:-1, mz?mz->f:255, cz->nw, cz->f); + mz = cz; + } + } + for (z = 0; z < r_n; z++) { + cz = &(r_a[z]); + if(mz) { + if((mz->f == RC_0) && (cz->f != RC_0)) continue; + if((mz->f == RC_1) && ((cz->f == RC_2) || (cz->f == RC_3))) continue; + } + if((!mz) || ((cz->f < mz->f) && ((cz->f == RC_0) || (cz->f == RC_1))) || + (mz->nw < cz->nw) || ((mz->nw == cz->nw) && ((mz->qe - mz->qs) < (cz->te - cz->ts)))) { + ///note here is (cz->te - cz->ts) + // fprintf(stderr, "-[M::%s::] qn::%u, tn::%u, mz->nw::%f, mz->f::%u, cz->nw::%f, cz->f::%u\n", + // __func__, a[l].qn, a[l].tn, mz?mz->nw:-1, mz?mz->f:255, cz->nw, cz->f); + mz = cz; + } + } + + // fprintf(stderr, "-[M::%s::] is_mz::%u, qn::%u, tn::%u\n", __func__, (uint32_t)(!!mz), mz->qn, mz->tn); + if((mz) && (mz->qn != mz->tn)) { + ///need to check it cover enough reads + kv_pushp(u_trans_t, *res, &cz); (*cz) = (*mz); + if(mz->qn != id) { + cz->qn = mz->tn; cz->qs = mz->ts; cz->qe = mz->te; + cz->tn = mz->qn; cz->ts = mz->qs; cz->te = mz->qe; + } + + if((cz->nw >= 0) && ((!trans_ovlp_connect1(cz, s->ug)) + || (!ovlp_rocc(cz, s->ug, s->rg, s->small_ov_rate, s->ov_cutoff)))) { + // if(mz->qn == 43 && mz->tn == 45) { + // fprintf(stderr, "-[M::%s::] utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\n", __func__, + // mz->qn+1, "lc"[s->ug->u.a[mz->qn].circ], s->ug->u.a[mz->qn].len, mz->qs, mz->qe, + // "+-"[mz->rev], + // mz->tn+1, "lc"[s->ug->u.a[mz->tn].circ], s->ug->u.a[mz->tn].len, mz->ts, mz->te); + // } + + res->n--; + } + // else { + // if(mz->qn == 43 && mz->tn == 45) { + // fprintf(stderr, "+[M::%s::] utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tf::%u\n", __func__, + // mz->qn+1, "lc"[s->ug->u.a[mz->qn].circ], s->ug->u.a[mz->qn].len, mz->qs, mz->qe, + // "+-"[mz->rev], mz->tn+1, "lc"[s->ug->u.a[mz->tn].circ], s->ug->u.a[mz->tn].len, mz->ts, mz->te, mz->f); + // } + // } + } + } + l = k; } - ma_ug_destroy(copy_ug); - asg_destroy(copy_sg); + } +} + +static void worker_for_trans_clean_re(void *data, long i, int tid) // callback for kt_for() +{ + u_trans_clean_t *s = (u_trans_clean_t*)data; + kv_u_trans_t *ta = s->ta; + kv_u_trans_t *res = &(s->res[tid]); + u_trans_t *a = NULL, *r_a = NULL, *mz, *cz, *sz; + uint32_t n, r_n, id = i, k, l, z; + uint32_t ovq, ovt, os, oe; - clean_u_trans_t_idx(&(cov->t_ch->k_trans), ug, sg); + a = u_trans_a(*ta, id); n = u_trans_n(*ta, id); + for (k = 1, l = 0; k <= n; k++) { + if(k == n || a[l].tn != a[k].tn) { + if(k > l) { + get_u_trans_spec(ta, a[l].tn, a[l].qn, &r_a, &r_n); + if(r_n > 0 && id > a[l].tn) { + l = k; continue; + } - new_rtg_edges.a.n = 0; - ma_ug_print_bed(ug, sg, &R_INF, coverage_cut, sources, &new_rtg_edges, - max_hang, min_ovlp, asm_opt.hic_inconsist_rate, NULL, NULL, cov->t_ch); + mz = cz = sz = NULL; + for (z = l; z < k; z++) { + cz = &(a[z]); + if(cz->f == RC_3) { + if((!sz) || (sz->nw < cz->nw)) sz = cz; + } + if(mz) { + if((mz->f == RC_0) && (cz->f != RC_0)) continue; + if((mz->f == RC_1) && ((cz->f == RC_2) || (cz->f == RC_3))) continue; + } + if((!mz) || ((cz->f < mz->f) && ((cz->f == RC_0) || (cz->f == RC_1))) || + (mz->nw < cz->nw) || ((mz->nw == cz->nw) && ((mz->qe - mz->qs) < (cz->qe - cz->qs)))) { + mz = cz; + } + } + for (z = 0; z < r_n; z++) { + cz = &(r_a[z]); + if(cz->f == RC_3) { + if((!sz) || (sz->nw < cz->nw)) sz = cz; + } + if(mz) { + if((mz->f == RC_0) && (cz->f != RC_0)) continue; + if((mz->f == RC_1) && ((cz->f == RC_2) || (cz->f == RC_3))) continue; + } + if((!mz) || ((cz->f < mz->f) && ((cz->f == RC_0) || (cz->f == RC_1))) || + (mz->nw < cz->nw) || ((mz->nw == cz->nw) && ((mz->qe - mz->qs) < (cz->te - cz->ts)))) { + mz = cz; + } + } - if((asm_opt.flag & HA_F_VERBOSE_GFA)) write_trans_chain(cov->t_ch, output_file_name); + + + // fprintf(stderr, "-[M::%s::] is_mz::%u, qn::%u, tn::%u\n", __func__, (uint32_t)(!!mz), mz->qn, mz->tn); + if((mz) && (mz->qn != mz->tn)) { + ///need to check it cover enough reads + kv_pushp(u_trans_t, *res, &cz); (*cz) = (*mz); + if((sz) && (sz->rev == cz->rev)) { + if((cz->qn != sz->qn) || (cz->tn != sz->tn)) { + cz->qn = mz->tn; cz->qs = mz->ts; cz->qe = mz->te; + cz->tn = mz->qn; cz->ts = mz->qs; cz->te = mz->qe; + } + if((cz->qn == sz->qn) && (cz->tn == sz->tn)) { + os = MAX(cz->qs, sz->qs); oe = MIN(cz->qe, sz->qe); + ovq = ((oe > os)? (oe - os):0); + + os = MAX(cz->ts, sz->ts); oe = MIN(cz->te, sz->te); + ovt = ((oe > os)? (oe - os):0); + + if(((ovq) && (ovq > ((cz->qe-cz->qs)*0.8)) && (ovq > ((sz->qe-sz->qs)*0.8))) && + (((ovt) && (ovt > ((cz->te-cz->ts)*0.8)) && (ovt > ((sz->te-sz->ts)*0.8))))) { + (*cz) = (*sz); cz->f = mz->f; + } + } + } + if(cz->qn != id) { + z = cz->qn; cz->qn = cz->tn; cz->tn = z; + z = cz->qs; cz->qs = cz->ts; cz->ts = z; + z = cz->qe; cz->qe = cz->te; cz->te = z; + } + + if((cz->nw >= 0) && ((!trans_ovlp_connect1(cz, s->ug)) + || (!ovlp_rocc(cz, s->ug, s->rg, s->small_ov_rate, s->ov_cutoff)))) { + res->n--; + } + } + } + l = k; + } } +} +int cmp_u_trans_weight(const void * a, const void * b) +{ + if((*(u_trans_t*)a).nw == (*(u_trans_t*)b).nw) return 0; + return ((*(u_trans_t*)a).nw) > ((*(u_trans_t*)b).nw)?-1:1; +} - ///for debug - // char* gfa_name = (char*)malloc(strlen(output_file_name)+50); - // sprintf(gfa_name, "%s.pre.clean_d_utg.noseq.gfa", output_file_name); - // FILE* output_file = fopen(gfa_name, "w"); - // ma_ug_print_simple(ug, sg, coverage_cut, sources, ruIndex, "utg", output_file); - // fclose(output_file); - // free(gfa_name); - // exit(1); - ///for debug +int64_t infer_utrans_ovlp_len(u_trans_t *li, u_trans_t *lj, ma_ug_t *ug) +{ + int64_t in, is, ie, irev, iqs, iqe, jn, js, je, jrev, jqs, jqe, ir, jr, ts, te, max_s, min_e, s_shift, e_shift; + + in = ug->u.a[li->tn].len; + is = li->ts; ie = li->te; irev = li->rev; iqs = li->qs; iqe = li->qe; + jn = ug->u.a[lj->tn].len; + js = lj->ts; je = lj->te; jrev = lj->rev; jqs = lj->qs; jqe = lj->qe; + + max_s = MAX(iqs, jqs); min_e = MIN(iqe, jqe); + if(min_e <= max_s) return 0; + s_shift = get_offset_adjust(max_s - iqs, iqe-iqs, ie-is); + e_shift = get_offset_adjust(iqe - min_e, iqe-iqs, ie-is); + if(irev) { + ts = s_shift; s_shift = e_shift; e_shift = ts; + } + is += s_shift; ie-= e_shift; + s_shift = get_offset_adjust(max_s - jqs, jqe-jqs, je-js); + e_shift = get_offset_adjust(jqe - min_e, jqe-jqs, je-js); + if(jrev) { + ts = s_shift; s_shift = e_shift; e_shift = ts; + } + js += s_shift; je-= e_shift; + if(irev) { + ts = in - ie; te = in - is; + is = ts; ie = te; + } - hic_analysis(ug, sg, cov?cov->t_ch:t_ch, &opt, 0, asm_opt.scffold?&rhits:NULL); + if(jrev) { + ts = jn - je; te = jn - js; + js = ts; je = te; + } + + if(is <= js) { + js -= is; is = 0; + } else { + is -= js; js = 0; + } + ir = in - ie; jr = jn - je; - if(!rhits && cov) destory_hap_cov_t(&cov); - if(!rhits && t_ch) destory_trans_chain(&t_ch); + if(ir <= jr){ + ie = in; je += ir; + } + else { + je = jn; ie += jr; + } + ir = ie - is; jr = je - js; + return MAX(ir, jr); +} - // char* gfa_name = (char*)malloc(strlen(output_file_name)+50); - // sprintf(gfa_name, "%s.after.clean_d_utg.noseq.gfa", output_file_name); - // FILE* output_file = fopen(gfa_name, "w"); - // ma_ug_print_simple(ug, sg, coverage_cut, sources, ruIndex, "utg", output_file); - // fclose(output_file); - // free(gfa_name); +///li is the suffix of lj +uint32_t is_connect_arc(u_trans_t *li, u_trans_t *lj, ma_ug_t *ug, double diff_ec_ul) +{ + if(li->qs >= lj->qs && li->qe >= lj->qe) { + int64_t dq = infer_utrans_ovlp_len(li, lj, ug); + uint32_t v = (li->tn<<1)|li->rev, w = (lj->tn<<1)|lj->rev; + int64_t dt = -1, dif, mm; uint32_t nv, i; asg_arc_t *av; + nv = asg_arc_n(ug->g, v); av = asg_arc_a(ug->g, v); + for (i = 0; i < nv; i++) { + if(av[i].del || av[i].v != w) continue; + dt = av[i].ol; + break; + } + + if(dt < 0) return 0; + dif = (dq>dt? dq-dt:dt-dq); + mm = MAX(dq, dt); mm *= diff_ec_ul; if(mm < 8) mm = 8; + if(dif <= mm) return 1; + } + return 0; +} + +uint32_t test_arc_rm(ma_ug_t *ug, u_trans_t *a, uint32_t a_n, uint32_t a_k, uint32_t len, asg64_v *srt, uint32_t len_cut, double rate_cut) +{ + uint32_t z, i, l, os, oe, ovq; + + srt->n = 0; kv_resize(uint64_t, *srt, a_n); + for (z = 0; z < a_n; z++) { + if(z == a_k) continue; + if(a[z].occ == ((uint32_t)-1)) continue; + srt->a[srt->n] = a[z].qs; + srt->a[srt->n] <<= 32; + srt->a[srt->n] |= a[z].qe; + srt->n++; + } + radix_sort_arch64(srt->a, srt->a+srt->n); + for (i = z = 0; i < srt->n; i++) { + os = (uint32_t)(srt->a[i]>>32); oe = (uint32_t)srt->a[i]; + if(z > 0 && os <= ((uint32_t)srt->a[z-1])) { + if(oe > ((uint32_t)srt->a[z-1])) { + srt->a[z-1] >>= 32; srt->a[z-1] <<= 32; srt->a[z-1] |= oe; + } + } else { + srt->a[z++] = srt->a[i]; + } + } + srt->n = z; l = (a[a_k].qe - a[a_k].qs); + for (z = 0; (z < srt->n) && (l > 0); z++) { + os = MAX(a[a_k].qs, ((uint32_t)(srt->a[z]>>32))); + oe = MIN(a[a_k].qe, ((uint32_t)srt->a[z])); + ovq = ((oe > os)? (oe - os):0); + assert(l >= ovq); l -= ovq; + } + // if((a[a_k].qn == 3924 && a[a_k].tn == 160) || (a[a_k].tn == 3924 && a[a_k].qn == 160)) { + // fprintf(stderr, "[M::%s::]\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tnw::%f\tf::%u\tdel::%u\tlen::%u\tlen_cut::%u\trate_cut::%f\tl::%u\n", __func__, + // a[a_k].qn+1, "lc"[ug->u.a[a[a_k].qn].circ], ug->u.a[a[a_k].qn].len, a[a_k].qs, a[a_k].qe, "+-"[a[a_k].rev], + // a[a_k].tn+1, "lc"[ug->u.a[a[a_k].tn].circ], ug->u.a[a[a_k].tn].len, a[a_k].ts, a[a_k].te, a[a_k].nw, a[a_k].f, + // (a[a_k].occ == ((uint32_t)-1))?1:0, len, len_cut, rate_cut, l); + // } + if((l > len_cut) && (l > (len*rate_cut))) return 0;///cannot be dropped + return 1; +} + +void deep_clean_u_trans(kv_u_trans_t *ta, u_trans_t *mz, ma_ug_t *ug, asg64_v *srt, uint32_t len_cut, double rate_cut) +{ + if(mz->qn > mz->tn) return; + u_trans_t *r_a, *cz, *a; uint32_t r_k, r_n, n, k; + r_a = u_trans_a(*ta, mz->tn); r_n = u_trans_n(*ta, mz->tn); + for (r_k = 0; (r_k < r_n) && (r_a[r_k].tn != mz->qn); r_k++); + assert(r_k < r_n); cz = &(r_a[r_k]); + // if((mz->qn == 3924 && mz->tn == 160) || (mz->tn == 3924 && mz->qn == 160)) { + // fprintf(stderr, "[M::%s::]\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tnw::%f\tf::%u\tdel::%u\n", __func__, + // mz->qn+1, "lc"[ug->u.a[mz->qn].circ], ug->u.a[mz->qn].len, mz->qs, mz->qe, "+-"[mz->rev], + // mz->tn+1, "lc"[ug->u.a[mz->tn].circ], ug->u.a[mz->tn].len, mz->ts, mz->te, mz->nw, mz->f, + // (mz->occ == ((uint32_t)-1))?1:0); + // fprintf(stderr, "[M::%s::]\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tnw::%f\tf::%u\tdel::%u\n", __func__, + // cz->qn+1, "lc"[ug->u.a[cz->qn].circ], ug->u.a[cz->qn].len, cz->qs, cz->qe, "+-"[cz->rev], + // cz->tn+1, "lc"[ug->u.a[cz->tn].circ], ug->u.a[cz->tn].len, cz->ts, cz->te, cz->nw, cz->f, + // (cz->occ == ((uint32_t)-1))?1:0); + // } + if((mz->occ == ((uint32_t)-1)) && (cz->occ == ((uint32_t)-1))) return; + if((mz->occ != ((uint32_t)-1)) && (cz->occ != ((uint32_t)-1))) return; + + if(!test_arc_rm(ug, r_a, r_n, r_k, ug->u.a[mz->tn].len, srt, len_cut, rate_cut)) { + mz->occ = cz->occ = 0; return; + } else { + a = u_trans_a(*ta, mz->qn); n = u_trans_n(*ta, mz->qn); + for (k = 0; (k < n) && (a[k].tn != mz->tn); k++); assert(k < n); + if(!test_arc_rm(ug, a, n, k, ug->u.a[mz->qn].len, srt, len_cut, rate_cut)) { + mz->occ = cz->occ = 0; return; + } + } + mz->occ = cz->occ = ((uint32_t)-1); +} + +static void worker_for_trans_sec_cut(void *data, long i, int tid) // callback for kt_for() +{ + u_trans_clean_t *s = (u_trans_clean_t*)data; + kv_u_trans_t *ta = s->ta; + asg64_v *srt = &(s->srt[tid]); + u_trans_t *a = NULL, *mz, *cz, t; + uint32_t n, r_n, id = i, k, l, z, m, wm, wl, len; + uint32_t ovq, os, oe; uint64_t *ss, *hs, *bu, *wu; + + if(!(s->cu)) { + a = u_trans_a(*ta, id); n = u_trans_n(*ta, id); + for (k = r_n = 0; k < n; k++) {///RC_0/RC_1 to a[0, r_n) + if((a[k].f == RC_0) || (a[k].f == RC_1)) { + if(k != r_n) { + t = a[k]; a[k] = a[r_n]; a[r_n] = t; + } + r_n++; + } + } + if(r_n >= n) return; + if(n-r_n > 1) qsort(a+r_n, n-r_n, sizeof((*a)), cmp_u_trans_weight); + // if(id == 3924 || id == 160) { + // fprintf(stderr, "[M::%s::]\tutg%.6u%c\tr_n::%u\tn::%u\n", __func__, + // id + 1, "lc"[s->ug->u.a[id].circ], r_n, n); + // for (k = 0; k < n; k++) { + // fprintf(stderr, "+[%u]\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tnw::%f\tf::%u\n", k, + // a[k].qn+1, "lc"[s->ug->u.a[a[k].qn].circ], s->ug->u.a[a[k].qn].len, a[k].qs, a[k].qe, "+-"[a[k].rev], + // a[k].tn+1, "lc"[s->ug->u.a[a[k].tn].circ], s->ug->u.a[a[k].tn].len, a[k].ts, a[k].te, a[k].nw, a[k].f); + // } + // } + srt->n = 0; kv_resize(uint64_t, *srt, (n<<2)); + for (k = 0; k < n; k++) { + srt->a[srt->n] = a[k].qs; + srt->a[srt->n] <<= 32; + srt->a[srt->n] |= k; + srt->n++; + } + ss = srt->a; hs = ss + srt->n; bu = hs + srt->n; wu = bu + srt->n; + radix_sort_arch64(ss, ss+n); + for (k = 0; k < n; k++) { + ss[k] = (uint32_t)ss[k]; hs[ss[k]] = k; + } + for (k = r_n; k < n; k++) { + cz = &(a[k]); len = (cz->qe-cz->qs)*s->sec_rate; m = wm = 0; + for (z = l = wl = 0; z < n; z++) { + if(z == hs[k]) { + assert(ss[z] == k); + continue; + } + if(ss[z] > k) continue;///smaller nw than a[k] + mz = &(a[ss[z]]); + if(mz->qs >= cz->qe) break; + if(mz->occ == ((uint32_t)-1)) continue;///has been deleted + if((mz->f != RC_0) && (mz->f != RC_1) && (cz->nw > (mz->nw*0.95))) continue; + os = MAX(cz->qs, mz->qs); oe = MIN(cz->qe, mz->qe); + ovq = ((oe > os)? (oe - os):0); + if(!ovq) continue; + if(is_connect_arc(cz, mz, s->ug, 0.04) || is_connect_arc(mz, cz, s->ug, 0.04)) continue; + + if((m > 0) && (((uint32_t)bu[m-1]) >= os)) { + if(oe > ((uint32_t)bu[m-1])) { + l += (oe - ((uint32_t)bu[m-1])); + bu[m-1] += (oe - ((uint32_t)bu[m-1])); + } + } else { + l += oe - os; + bu[m] = os; bu[m] <<= 32; bu[m] |= oe; m++; + } + + if((wm > 0) && (((uint32_t)wu[wm-1]) >= mz->qs)) { + if(mz->qe > ((uint32_t)wu[wm-1])) { + wl += (mz->qe - ((uint32_t)wu[wm-1])); + wu[wm-1] += (mz->qe - ((uint32_t)wu[wm-1])); + } + } else { + wl += mz->qe - mz->qs; + wu[wm] = mz->qs; wu[wm] <<= 32; wu[wm] |= mz->qe; wm++; + } + + if((l > 0) && ((l>=len) || (l>=(wl*s->sec_rate)))) { + // if((cz->qn == 3924 && cz->tn == 160) || (cz->tn == 3924 && cz->qn == 160)) { + // fprintf(stderr, "[M::%s]\tutg%.6u%c\t->\tutg%.6u%c\tl::%u\tlen::%u\twl::%u\tsec_rate::%f\n", __func__, + // cz->qn+1, "lc"[s->ug->u.a[cz->qn].circ], + // cz->tn+1, "lc"[s->ug->u.a[cz->tn].circ], + // l, len, wl, s->sec_rate); + // } + cz->occ = ((uint32_t)-1); + break; + } + } + } + // if(id == 3924 || id == 160) { + // for (k = 0; k < n; k++) { + // fprintf(stderr, "-[%u]\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tnw::%f\tf::%u\tdel::%u\n", k, + // a[k].qn+1, "lc"[s->ug->u.a[a[k].qn].circ], s->ug->u.a[a[k].qn].len, a[k].qs, a[k].qe, "+-"[a[k].rev], + // a[k].tn+1, "lc"[s->ug->u.a[a[k].tn].circ], s->ug->u.a[a[k].tn].len, a[k].ts, a[k].te, a[k].nw, a[k].f, + // (a[k].occ == ((uint32_t)-1))?1:0); + // } + // } + } else { + assert(((uint32_t)(s->cu->idx.a[id])) > (s->cu->idx.a[id]>>32)+1); + for (z = (s->cu->idx.a[id]>>32); z < ((uint32_t)(s->cu->idx.a[id])); z++) { + a = u_trans_a(*ta, ((uint32_t)s->cu->z.a[z])); + n = u_trans_n(*ta, ((uint32_t)s->cu->z.a[z])); + for (k = 0; k < n; k++) deep_clean_u_trans(ta, &(a[k]), s->ug, srt, 3000000, 0.5); + } + } +} + +void dbg_prt_utg_trans(kv_u_trans_t *ta, ma_ug_t *ug, const char *o_n) +{ + char* gfa_name = (char*)malloc(strlen(o_n)+100); + FILE *fn = NULL; sprintf(gfa_name, "%s.tran.dbg.ovlp.log", o_n); + u_trans_t *p = NULL; uint32_t i; fn = fopen(gfa_name, "w"); + for (i = 0; i < ta->n; i++) { + p = &(ta->a[i]); + fprintf(fn, "utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tw(%f)\tf(%u)\n", + p->qn+1, "lc"[ug->u.a[p->qn].circ], ug->u.a[p->qn].len, p->qs, p->qe, "+-"[p->rev], + p->tn+1, "lc"[ug->u.a[p->tn].circ], ug->u.a[p->tn].len, p->ts, p->te, p->nw, p->f); + } + fclose(fn); free(gfa_name); fprintf(stderr, "[M::%s::] done\n", __func__); +} + +void clean_u_trans_t_idx_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g) +{ + u_trans_clean_t sl; uint64_t k, i, l, occ; ha_mzl_t *tz; + ha_mzl_v srt_a; kv_u_trans_t *bl; u_trans_t *z; + memset(&sl, 0, sizeof(sl)); kv_init(srt_a); + + kt_u_trans_t_idx(ta, ug->g->n_seq); + sl.n_thread = asm_opt.thread_num; sl.ug = ug; sl.rg = read_g; sl.ta = ta; + CALLOC(sl.res, sl.n_thread); + sl.ov_cutoff = 3; sl.small_ov_rate = 0.8; + // dbg_prt_utg_trans(ta, ug, "pre"); + kt_for(sl.n_thread, worker_for_trans_clean, &sl, sl.ta->idx.n); + + for (i = srt_a.n = occ = 0; i < sl.n_thread; i++) { + bl = &(sl.res[i]); + if(!(bl->n)) continue; + for (k = 1, l = 0; k <= bl->n; k++) { + if(k == bl->n || bl->a[k].qn != bl->a[l].qn) { + if(k > l) { + kv_pushp(ha_mzl_t, srt_a, &tz); + tz->x = bl->a[l].qn; tz->x <<= 32; tz->x |= i; + tz->rid = l>>32; tz->pos = (uint32_t)l; + occ += (k - l); + } + l = k; + } + } + } + // fprintf(stderr, "[M::%s::] occ::%lu \n", __func__, occ); + assert(srt_a.n <= sl.ug->u.n); + radix_sort_ha_mzl_t_srt1(srt_a.a, srt_a.a + srt_a.n); + occ <<= 1; ta->n = 0; kv_resize(u_trans_t, *ta, occ); + for (i = 0; i < srt_a.n; i++) { + tz = &(srt_a.a[i]); + bl = &(sl.res[(uint32_t)(tz->x)]); + k = tz->rid; k <<= 32; k += tz->pos; + assert(bl->a[k].qn == (tz->x>>32)); + for (; (k < bl->n) && (bl->a[k].qn == (tz->x>>32)); k++) { + if(bl->a[k].qn == bl->a[k].tn) continue; + kv_pushp(u_trans_t, *ta, &z); + (*z) = bl->a[k]; if(z->f == RC_3) z->f = RC_2; + + kv_pushp(u_trans_t, *ta, &z); + (*z) = bl->a[k]; if(z->f == RC_3) z->f = RC_2; + z->qn = bl->a[k].tn; z->qs = bl->a[k].ts; z->qe = bl->a[k].te; + z->tn = bl->a[k].qn; z->ts = bl->a[k].qs; z->te = bl->a[k].qe; + } + } + + for (k = 0; k < sl.n_thread; k++) free(sl.res[k].a); + free(sl.res); kv_destroy(srt_a); + kt_u_trans_t_idx(ta, ug->g->n_seq); + // dbg_prt_utg_trans(ta, ug, "after"); +} + + +void dbg_prt_utg_extra_trans(kv_u_trans_t *ta, ma_ug_t *ug, const char *o_n) +{ + char* gfa_name = (char*)malloc(strlen(o_n)+100); + FILE *fn = NULL; sprintf(gfa_name, "%s.tran.dbg.ovlp.log", o_n); + u_trans_t *p = NULL; uint32_t i; fn = fopen(gfa_name, "w"); + double nw[2], ml, uml, sec, fw; + for (i = 0; i < ta->n; i++) { + p = &(ta->a[i]); + if((p->f == RC_0) || (p->f == RC_1)) continue; + + sec = ((p->qe-p->qs)*asm_opt.trans_base_rate); + if(sec < ((p->te-p->ts)*asm_opt.trans_base_rate)) { + sec = ((p->te-p->ts)*asm_opt.trans_base_rate); + } + ml = p->qe - p->qs; uml = sec; ml -= uml; + nw[0] = (ml*CHAIN_MATCH) - (uml*CHAIN_UNMATCH); + ml = p->te - p->ts; uml = sec; ml -= uml; + nw[1] = (ml*CHAIN_MATCH) - (uml*CHAIN_UNMATCH); + fw = MIN(nw[0], nw[1]); + if(fw <= p->nw) continue; + + fprintf(fn, "utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tw(%f)\tf(%u)\n", + p->qn+1, "lc"[ug->u.a[p->qn].circ], ug->u.a[p->qn].len, p->qs, p->qe, "+-"[p->rev], + p->tn+1, "lc"[ug->u.a[p->tn].circ], ug->u.a[p->tn].len, p->ts, p->te, p->nw, p->f); + } + fclose(fn); free(gfa_name); fprintf(stderr, "[M::%s::] done\n", __func__); +} + +u_trans_cluster* gen_u_trans_cluster(kv_u_trans_t *ta) +{ + u_trans_cluster *p; CALLOC(p, 1); + p->z.n = p->z.m = ta->idx.n; MALLOC(p->z.a, p->z.n); + memset(p->z.a, -1, sizeof(*(p->z.a))*p->z.n); + u_trans_t *a = NULL; uint64_t n, v, k, l, i, cn, m; asg64_v b; kv_init(b); + for (i = b.n = cn = 0; i < ta->idx.n; i++) { + v = i; + if(p->z.a[v] != ((uint64_t)-1)) continue; + kv_push(uint64_t, b, v); m = 0; + while(b.n) { + v = b.a[--b.n]; + if(p->z.a[v] != ((uint64_t)-1)) continue; + p->z.a[v] = i; p->z.a[v] <<= 32; p->z.a[v] |= v; m++; + a = u_trans_a(*ta, v); n = u_trans_n(*ta, v); + for (k = 0; k < n; k++) { + if(p->z.a[a[k].tn] != ((uint64_t)-1)) continue; + kv_push(uint64_t, b, a[k].tn); + } + } + if(m > 1) cn++; + } + radix_sort_arch64(p->z.a, p->z.a + p->z.n); + p->idx.n = 0; p->idx.m = cn; CALLOC(p->idx.a, p->idx.m); + for (l = 0, k = 1; k <= p->z.n; k++) { + if(k == p->z.n || (p->z.a[k]>>32) != (p->z.a[l]>>32)) { + if(k - l > 1) p->idx.a[p->idx.n++] = (l<<32)|(k); + l = k; + } + } + assert(p->idx.n == cn); + free(b.a); + return p; +} + +void clean_u_trans_t_idx_filter_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g) +{ + u_trans_clean_t sl; uint64_t k, i, l, st, occ; ha_mzl_t *tz; + ha_mzl_v srt_a; kv_u_trans_t *bl; u_trans_t *z; + memset(&sl, 0, sizeof(sl)); kv_init(srt_a); + + kt_u_trans_t_idx(ta, ug->g->n_seq); + sl.n_thread = asm_opt.thread_num; sl.ug = ug; sl.rg = read_g; sl.ta = ta; + CALLOC(sl.res, sl.n_thread); + sl.ov_cutoff = 3; sl.small_ov_rate = 0.8; + // dbg_prt_utg_trans(ta, ug, "pre"); + kt_for(sl.n_thread, worker_for_trans_clean_re, &sl, sl.ta->idx.n); + + for (i = srt_a.n = occ = 0; i < sl.n_thread; i++) { + bl = &(sl.res[i]); + if(!(bl->n)) continue; + for (k = 1, l = 0; k <= bl->n; k++) { + if(k == bl->n || bl->a[k].qn != bl->a[l].qn) { + if(k > l) { + kv_pushp(ha_mzl_t, srt_a, &tz); + tz->x = bl->a[l].qn; tz->x <<= 32; tz->x |= i; + tz->rid = l>>32; tz->pos = (uint32_t)l; + occ += (k - l); + } + l = k; + } + } + } + // fprintf(stderr, "[M::%s::] occ::%lu \n", __func__, occ); + assert(srt_a.n <= sl.ug->u.n); + radix_sort_ha_mzl_t_srt1(srt_a.a, srt_a.a + srt_a.n); + occ <<= 1; ta->n = 0; kv_resize(u_trans_t, *ta, occ); + for (i = 0; i < srt_a.n; i++) { + tz = &(srt_a.a[i]); + bl = &(sl.res[(uint32_t)(tz->x)]); + k = tz->rid; k <<= 32; k += tz->pos; + assert(bl->a[k].qn == (tz->x>>32)); + for (; (k < bl->n) && (bl->a[k].qn == (tz->x>>32)); k++) { + if(bl->a[k].qn == bl->a[k].tn) continue; + kv_pushp(u_trans_t, *ta, &z); + (*z) = bl->a[k]; z->occ = 0; if(z->f == RC_3) z->f = RC_2; + + kv_pushp(u_trans_t, *ta, &z); + (*z) = bl->a[k]; z->occ = 0; if(z->f == RC_3) z->f = RC_2; + z->qn = bl->a[k].tn; z->qs = bl->a[k].ts; z->qe = bl->a[k].te; + z->tn = bl->a[k].qn; z->ts = bl->a[k].qs; z->te = bl->a[k].qe; + } + } - ma_ug_destroy(ug); - kv_destroy(new_rtg_edges.a); + for (k = 0; k < sl.n_thread; k++) free(sl.res[k].a); + free(sl.res); kv_destroy(srt_a); + kt_u_trans_t_idx(ta, ug->g->n_seq); + // dbg_prt_utg_trans(ta, ug, "after"); - asg_arc_t* av = NULL; - uint32_t v, w, k, i, nv; - for (i = 0; i < d_edges.a.n; i++) - { - v = d_edges.a.a[i].ul>>32; - w = d_edges.a.a[i].v; - av = asg_arc_a(sg, v); - nv = asg_arc_n(sg, v); - for (k = 0; k < nv; k++) - { - if(av[k].del) continue; - if(av[k].v == w) - { - av[k].del = 1; - break; - } - } - } - kv_destroy(d_edges.a); - asg_cleanup(sg); - reduce_hamming_error(sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz); + CALLOC(sl.srt, sl.n_thread); sl.sec_rate = 0.5; + kt_for(sl.n_thread, worker_for_trans_sec_cut, &sl, sl.ta->idx.n); + sl.cu = gen_u_trans_cluster(ta); + kt_for(sl.n_thread, worker_for_trans_sec_cut, &sl, sl.cu->idx.n); + free(sl.cu->idx.a); free(sl.cu->z.a); free(sl.cu); + for (k = 0; k < sl.n_thread; k++) free(sl.srt[k].a); free(sl.srt); - ug_fa = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL, NULL, NULL); - ug_mo = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL, NULL, NULL); - if(rhits) - { - ha_aware_order(rhits, sg, ug_fa, ug_mo, cov?&(cov->t_ch->k_trans):&(t_ch->k_trans), &opt, 3); - kv_destroy(rhits->a); kv_destroy(rhits->idx); kv_destroy(rhits->occ); free(rhits); - if(cov) destory_hap_cov_t(&cov); - if(t_ch) destory_trans_chain(&t_ch); - ma_ug_destroy(ug_fa); ma_ug_destroy(ug_mo); - } + for (k = st = 0; k < ta->n; k++) { + if(ta->a[k].occ == ((uint32_t)-1)) continue; + ta->a[st++] = ta->a[k]; + } + ta->n = st; + for (st = 0, i = 1; i <= ta->n; ++i) { + if (i == ta->n || ta->a[i].qn != ta->a[st].qn) { + ta->idx.a[ta->a[st].qn] = (((uint64_t)st)<<32)|(i-st); + st = i; + } + } + // dbg_prt_utg_extra_trans(ta, ug, asm_opt.output_file_name); } -void print_debug_gfa(ma_ug_t *ug, asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, -ma_hit_t_alloc* sources, R_to_U* ruIndex) +void refine_hic_trans(ug_opt_t *opt, kv_u_trans_t *ta, asg_t *sg, ma_ug_t *ug) { - char* gfa_name = (char*)malloc(strlen(output_file_name)+50); - sprintf(gfa_name, "%s.after.clean_d_utg.gfa", output_file_name); - FILE* output_file = fopen(gfa_name, "w"); - ma_ug_print(ug, sg, coverage_cut, sources, ruIndex, "utg", output_file); - fclose(output_file); - - sprintf(gfa_name, "%s.after.clean_d_utg.noseq.gfa", output_file_name); - output_file = fopen(gfa_name, "w"); - ma_ug_print_simple(ug, sg, coverage_cut, sources, ruIndex, "utg", output_file); - fclose(output_file); - - free(gfa_name); + filter_u_trans(ta, asm_opt.is_bub_trans, asm_opt.is_topo_trans, asm_opt.is_read_trans, asm_opt.is_base_trans); + if(asm_opt.is_base_trans) { + trans_base_infer(ug, sg, opt, ta, NULL); + } + // dbg_prt_utg_trans(ta, ug, "pre"); + clean_u_trans_t_idx_filter_adv(ta, ug, sg); + // dbg_prt_utg_trans(ta, ug, "after"); } -void output_ul_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, +void output_contig_graph_alternative(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, +ma_hit_t_alloc* sources, R_to_U* ruIndex, int max_hang, int min_ovlp); +void output_hic_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, -long long gap_fuzz, bub_label_t* b_mask_t) +long long gap_fuzz, bub_label_t* b_mask_t, ug_opt_t *opt) { hic_clean(sg); - ug_opt_t opt; memset(&opt, 0, sizeof(opt)); - // kvec_pe_hit *rhits = NULL; - // ma_ug_t *ug_fa = NULL, *ug_mo = NULL; - opt.coverage_cut = coverage_cut; - opt.sources = sources; - opt.reverse_sources = reverse_sources; - opt.tipsLen = (asm_opt.max_short_tip*2); - opt.tip_drop_ratio = 0.15; - opt.stops_threshold = 3; - opt.ruIndex = ruIndex; - opt.chimeric_rate = 0.05; - opt.drop_ratio = 0.9; - opt.max_hang = max_hang; - opt.min_ovlp = min_ovlp; - opt.is_bench = 0; - opt.b_mask_t = b_mask_t; - opt.gap_fuzz = gap_fuzz; - + kvec_pe_hit *rhits = NULL; + ma_ug_t *ug_fa = NULL, *ug_mo = NULL; kvec_asg_arc_t_warp new_rtg_edges, d_edges; kv_init(new_rtg_edges.a); kv_init(d_edges.a); @@ -13942,9 +14532,7 @@ long long gap_fuzz, bub_label_t* b_mask_t) new_rtg_edges.a.n = 0; ma_ug_seq(ug, sg, coverage_cut, sources, &new_rtg_edges, max_hang, min_ovlp, &d_edges, 1);///polish - // print_debug_gfa(ug, sg, coverage_cut, output_file_name, sources, ruIndex); - ul_resolve(ug, sg, &opt, asm_opt.polyploidy); - /** + hap_cov_t *cov = NULL; trans_chain* t_ch = NULL; if((asm_opt.flag & HA_F_VERBOSE_GFA)) t_ch = load_hc_trans(output_file_name); @@ -13959,7 +14547,6 @@ long long gap_fuzz, bub_label_t* b_mask_t) adjust_utg_by_primary(©_ug, copy_sg, TRIO_THRES, sources, reverse_sources, coverage_cut, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, &new_rtg_edges, &cov, b_mask_t, 1, 0); - print_utg(copy_ug, copy_sg, coverage_cut, output_file_name, sources, ruIndex, max_hang, min_ovlp, &new_rtg_edges); @@ -13968,11 +14555,10 @@ long long gap_fuzz, bub_label_t* b_mask_t) output_contig_graph_alternative(copy_sg, coverage_cut, output_file_name, sources, ruIndex, max_hang, min_ovlp); } - ma_ug_destroy(copy_ug); asg_destroy(copy_sg); + // clean_u_trans_t_idx(&(cov->t_ch->k_trans), ug, sg); - clean_u_trans_t_idx(&(cov->t_ch->k_trans), ug, sg); new_rtg_edges.a.n = 0; ma_ug_print_bed(ug, sg, &R_INF, coverage_cut, sources, &new_rtg_edges, @@ -13981,14 +14567,28 @@ long long gap_fuzz, bub_label_t* b_mask_t) if((asm_opt.flag & HA_F_VERBOSE_GFA)) write_trans_chain(cov->t_ch, output_file_name); } - // char* gfa_name = (char*)malloc(strlen(output_file_name)+50); + refine_hic_trans(opt, &(cov?cov->t_ch->k_trans:t_ch->k_trans), sg, ug); + ///for debug + // char* gfa_name = (char*)malloc(strlen(output_file_name)+50); FILE* output_file = NULL; + // sprintf(gfa_name, "%s.pre.clean_d_utg.noseq.gfa", output_file_name); - // FILE* output_file = fopen(gfa_name, "w"); + // output_file = fopen(gfa_name, "w"); // ma_ug_print_simple(ug, sg, coverage_cut, sources, ruIndex, "utg", output_file); // fclose(output_file); + + // sprintf(gfa_name, "%s.pre.clean_d_utg.gfa", output_file_name); + // output_file = fopen(gfa_name, "w"); + // ma_ug_print(ug, sg, coverage_cut, sources, ruIndex, "utg", output_file); + // fclose(output_file); + // free(gfa_name); + // exit(1); + ///for debug + + + + hic_analysis(ug, sg, cov?cov->t_ch:t_ch, opt, 0, asm_opt.scffold?&rhits:NULL); - hic_analysis(ug, sg, cov?cov->t_ch:t_ch, &opt, 0, asm_opt.scffold?&rhits:NULL); if(!rhits && cov) destory_hap_cov_t(&cov); if(!rhits && t_ch) destory_trans_chain(&t_ch); @@ -14029,18 +14629,34 @@ long long gap_fuzz, bub_label_t* b_mask_t) reduce_hamming_error(sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz); ug_fa = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL); + 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL, NULL, NULL); ug_mo = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL); + 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL, NULL, NULL); if(rhits) { - ha_aware_order(rhits, sg, ug_fa, ug_mo, cov?&(cov->t_ch->k_trans):&(t_ch->k_trans), &opt, 3); + ha_aware_order(rhits, sg, ug_fa, ug_mo, cov?&(cov->t_ch->k_trans):&(t_ch->k_trans), opt, 3); kv_destroy(rhits->a); kv_destroy(rhits->idx); kv_destroy(rhits->occ); free(rhits); if(cov) destory_hap_cov_t(&cov); if(t_ch) destory_trans_chain(&t_ch); ma_ug_destroy(ug_fa); ma_ug_destroy(ug_mo); } - **/ +} + +void print_debug_gfa(ma_ug_t *ug, asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, +ma_hit_t_alloc* sources, R_to_U* ruIndex) +{ + char* gfa_name = (char*)malloc(strlen(output_file_name)+50); + sprintf(gfa_name, "%s.after.clean_d_utg.gfa", output_file_name); + FILE* output_file = fopen(gfa_name, "w"); + ma_ug_print(ug, sg, coverage_cut, sources, ruIndex, "utg", output_file); + fclose(output_file); + + sprintf(gfa_name, "%s.after.clean_d_utg.noseq.gfa", output_file_name); + output_file = fopen(gfa_name, "w"); + ma_ug_print_simple(ug, sg, coverage_cut, sources, ruIndex, "utg", output_file); + fclose(output_file); + + free(gfa_name); } ma_ug_t *get_poly_ug(asg_t *sg, ma_sub_t* coverage_cut, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, @@ -14544,37 +15160,26 @@ void kt_u_trans_t_symm(kv_u_trans_t *ta, ma_ug_t *ug) if (i == n || a[i].tn != a[st].tn) { get_u_trans_spec(ta, a[st].tn, a[st].qn, &r_a, &r_n); - if(i - st == 1 && r_n == 0)///should be always here - { + if(i == st + 1 && r_n == 0) {///should be always here st = i; continue; } - e0.n = e1.n = 0; - for (m = st; m < i; m++) - { + for (m = st; m < i; m++) { if(a[m].del) continue; - if(a[m].rev) - { + if(a[m].rev) { kv_push(u_trans_t, e1, a[m]); - } - else - { + } else { kv_push(u_trans_t, e0, a[m]); } } - - for (m = 0; m < r_n; m++) - { + for (m = 0; m < r_n; m++) { if(r_a[m].del) continue; - if(r_a[m].rev) - { + if(r_a[m].rev) { kv_pushp(u_trans_t, e1, &p); - } - else - { + } else { kv_pushp(u_trans_t, e0, &p); } (*p) = r_a[m]; @@ -14583,8 +15188,7 @@ void kt_u_trans_t_symm(kv_u_trans_t *ta, ma_ug_t *ug) } - if(e0.n + e1.n > 1) - { + if(e0.n + e1.n > 1) { ///must be here for (m = st; m < i; m++) a[m].del = 1; for (m = 0; m < r_n; m++) r_a[m].del = 1; @@ -14882,21 +15486,6 @@ void filter_u_trans_t(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g, uint32_t thr kt_u_trans_t_idx(ta, ug->g->n_seq); } - -void dbg_prt_utg_trans(kv_u_trans_t *ta, ma_ug_t *ug, const char *o_n) -{ - char* gfa_name = (char*)malloc(strlen(o_n)+100); - FILE *fn = NULL; sprintf(gfa_name, "%s.tran.dbg.ovlp.log", o_n); - u_trans_t *p = NULL; uint32_t i; fn = fopen(gfa_name, "w"); - for (i = 0; i < ta->n; i++) { - p = &(ta->a[i]); - fprintf(fn, "utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tw(%f)\tf(%u)\n", - p->qn+1, "lc"[ug->u.a[p->qn].circ], ug->u.a[p->qn].len, p->qs, p->qe, "+-"[p->rev], - p->tn+1, "lc"[ug->u.a[p->tn].circ], ug->u.a[p->tn].len, p->ts, p->te, p->nw, p->f); - } - fclose(fn); free(gfa_name); fprintf(stderr, "[M::%s::] done\n", __func__); -} - void clean_u_trans_t_idx(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g) { // dbg_prt_utg_trans(ta, ug, "pre"); @@ -14907,6 +15496,113 @@ void clean_u_trans_t_idx(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g) // dbg_prt_utg_trans(ta, ug, "after"); } +void gen_bp_phasing(ug_opt_t *opt, kv_u_trans_t *ta, ma_ug_t *ug, asg_t *sg) +{ + uint8_t *bf = NULL; + bubble_type *bub = gen_bubble_chain(sg, ug, opt, &bf); free(bf); + filter_u_trans(ta, asm_opt.is_bub_trans, asm_opt.is_topo_trans, asm_opt.is_read_trans, asm_opt.is_base_trans); + // dbg_prt_utg_trans(ta, ug, "pre"); + if(asm_opt.is_base_trans) { + trans_base_infer(ug, sg, opt, ta, bub); + } + // dbg_prt_utg_trans(ta, ug, "after"); + clean_u_trans_t_idx_filter_adv(ta, ug, sg); + + + bp_solve(opt, ta, ug, sg, bub, asm_opt.trans_base_rate); + + destory_bubbles(bub); free(bub); +} + + +void output_bp_graph_adv(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, +ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, +long long tipsLen, float tip_drop_ratio, long long stops_threshold, +R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, +bub_label_t* b_mask_t, ug_opt_t *opt) +{ + hic_clean(sg); + + kvec_asg_arc_t_warp new_rtg_edges, d_edges; + kv_init(new_rtg_edges.a); kv_init(d_edges.a); + ma_ug_t *ug = NULL; + ug = ma_ug_gen_primary(sg, PRIMARY_LABLE); + + new_rtg_edges.a.n = 0; + ma_ug_seq(ug, sg, coverage_cut, sources, &new_rtg_edges, max_hang, min_ovlp, &d_edges, 1);///polish + + new_rtg_edges.a.n = 0; + hap_cov_t *cov = NULL; + asg_t *copy_sg = copy_read_graph(sg); + ma_ug_t *copy_ug = copy_untig_graph(ug); + /*******************************for debug************************************/ + adjust_utg_by_primary(©_ug, copy_sg, TRIO_THRES, sources, reverse_sources, coverage_cut, + tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio, + max_hang, min_ovlp, &new_rtg_edges, &cov, b_mask_t, 1, 0/**1**/); + // adjust_utg_advance(copy_sg, copy_ug, reverse_sources, ruIndex, b_mask_t); + // get_utg_ovlp(©_ug, copy_sg, sources, reverse_sources, coverage_cut, + // ruIndex, max_hang, min_ovlp, &new_rtg_edges, b_mask_t, NULL); + // exit(1); + /*******************************for debug************************************/ + print_utg(copy_ug, copy_sg, coverage_cut, output_file_name, sources, ruIndex, max_hang, + min_ovlp, &new_rtg_edges); + ma_ug_destroy(copy_ug); + asg_destroy(copy_sg); + + // gen_bp_phasing(opt, &(cov->t_ch->k_trans), ug, sg); + clean_u_trans_t_idx(&(cov->t_ch->k_trans), ug, sg); + set_trio_flag_by_cov(ug, sg, cov); + + //for debug + // char* gfa_name = (char*)malloc(strlen(output_file_name)+50); + // sprintf(gfa_name, "%s.pre.clean_d_utg.noseq.gfa", output_file_name); + // FILE* output_file = fopen(gfa_name, "w"); + // ma_ug_print(ug, sg, coverage_cut, sources, ruIndex, "utg", output_file); + // fclose(output_file); + // free(gfa_name); + // exit(1); + //for debug + + + // print_untig_by_read(copy_ug, "m64011_190830_220126/88867583/ccs", 603738, NULL, NULL, "sb"); + + ///debug_gfa_space(ug, cov); + + + // print_r_het(cov, R_INF.trio_flag, "out-1"); + // print_debug_gfa(ug, sg, coverage_cut, output_file_name, sources, ruIndex); + + destory_hap_cov_t(&cov); + ma_ug_destroy(ug); + kv_destroy(new_rtg_edges.a); + + asg_arc_t* av = NULL; + uint32_t v, w, k, i, nv; + for (i = 0; i < d_edges.a.n; i++) + { + v = d_edges.a.a[i].ul>>32; + w = d_edges.a.a[i].v; + av = asg_arc_a(sg, v); + nv = asg_arc_n(sg, v); + for (k = 0; k < nv; k++) + { + if(av[k].del) continue; + if(av[k].v == w) + { + av[k].del = 1; + break; + } + } + } + kv_destroy(d_edges.a); + asg_cleanup(sg); + + + output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, + 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL, NULL, NULL); + output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, + 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL, NULL, NULL); +} void output_bp_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, @@ -31983,11 +32679,17 @@ ma_sub_t **coverage_cut_ptr, int debug_g) free(unlean_name); } + + // if (asm_opt.flag & HA_F_VERBOSE_GFA) { + // write_debug_graph(sg, sources, coverage_cut, output_file_name, reverse_sources, ruIndex, &UL_INF); + // debug_gfa:; + // } + gen_ug_opt_t(&uopt, sources, reverse_sources, max_hang_length, mini_overlap_length, gap_fuzz, min_dp, readLen, coverage_cut, ruIndex, (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, &b_mask_t); ul_clean_gfa(&uopt, sg, sources, reverse_sources, ruIndex, clean_round, min_ovlp_drop_ratio, max_ovlp_drop_ratio, 0.6, asm_opt.max_short_tip, gap_fuzz, &b_mask_t, !!asm_opt.ar, ha_opt_triobin(&asm_opt), UL_COV_THRES, o_file); - + ///@brief debug if (asm_opt.flag & HA_F_VERBOSE_GFA) { write_debug_graph(sg, sources, coverage_cut, output_file_name, reverse_sources, ruIndex, &UL_INF); debug_gfa:; @@ -31995,11 +32697,14 @@ ma_sub_t **coverage_cut_ptr, int debug_g) (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, &b_mask_t); // set_hom_global_coverage(&asm_opt, sg, coverage_cut, sources, reverse_sources, ruIndex, max_hang_length, mini_overlap_length); } + if(asm_opt.ar) { renew_g(&sources, &reverse_sources, &n_read, &readLen, &coverage_cut, ruIndex, &sg, mini_overlap_length, max_hang_length, &uopt, clean_round, min_ovlp_drop_ratio, max_ovlp_drop_ratio, asm_opt.max_short_tip, &b_mask_t, ha_opt_triobin(&asm_opt), o_file); - + ///make sure uopt has been updated; not necessary + gen_ug_opt_t(&uopt, sources, reverse_sources, max_hang_length, mini_overlap_length, gap_fuzz, min_dp, readLen, coverage_cut, ruIndex, + (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, &b_mask_t); // ma_ug_t *iug = ul_realignment_gfa(&uopt, sg, clean_round, min_ovlp_drop_ratio, max_ovlp_drop_ratio, // asm_opt.max_short_tip, &b_mask_t, ha_opt_triobin(&asm_opt), o_file); // gen_ng(iug, sg, &uopt, &coverage_cut, ruIndex, 100); @@ -32135,6 +32840,8 @@ ma_sub_t **coverage_cut_ptr, int debug_g) // flat_bubbles(sg, ruIndex->is_het); free(ruIndex->is_het); ruIndex->is_het = NULL; flat_soma_v(sg, sources, ruIndex); **/ + ug_ext_gfa(&uopt, sg, ug_ext_len); + if(!ha_opt_triobin(&asm_opt)) { // output_unitig_graph(sg, coverage_cut, "pre_clean", sources, ruIndex, max_hang_length, mini_overlap_length); hic_clean_adv(sg, &uopt); @@ -32154,13 +32861,7 @@ ma_sub_t **coverage_cut_ptr, int debug_g) if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; output_poly_trio(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t, asm_opt.polyploidy); - }/** - else if(asm_opt.ar) - { - if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; - output_ul_graph(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), - 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, gap_fuzz, &b_mask_t); - }**/ + } else if (ha_opt_triobin(&asm_opt) && ha_opt_hic(&asm_opt)) { if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; @@ -32177,14 +32878,14 @@ ma_sub_t **coverage_cut_ptr, int debug_g) { if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; output_hic_graph(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), - 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, gap_fuzz, &b_mask_t); + 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, gap_fuzz, &b_mask_t, &uopt); // output_hic_graph_polyploid(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), // 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, gap_fuzz, &b_mask_t); } else if((asm_opt.flag & HA_F_PARTITION) && (asm_opt.purge_level_primary > 0)) { output_bp_graph(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), - 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, &b_mask_t); + 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, &b_mask_t/**, &uopt**/); } else { diff --git a/Overlaps.h b/Overlaps.h index ba9da7a..c3b3554 100644 --- a/Overlaps.h +++ b/Overlaps.h @@ -33,6 +33,7 @@ // #define ALTER_LABLE 2 // #define HAP_LABLE 4 #define HA_RE_UL_ID "re" +#define ug_ext_len 75000 #define Get_qn(RECORD) ((uint32_t)((RECORD).qns>>32)) #define Get_qs(RECORD) ((uint32_t)((RECORD).qns)) @@ -1135,6 +1136,8 @@ asg_t *i_read_sg, trans_chain* i_t_ch, uint32_t i_cBeg, uint32_t i_cEnd); void extract_sub_overlaps(uint32_t i_tScur, uint32_t i_tEcur, uint32_t i_tSpre, uint32_t i_tEpre, uint32_t tn, kv_u_trans_hit_t* ktb, uint32_t bn); void clean_u_trans_t_idx(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g); +void clean_u_trans_t_idx_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g); +void clean_u_trans_t_idx_filter_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g); uint32_t test_dbug(ma_ug_t* ug, FILE* fp); void write_dbug(ma_ug_t* ug, FILE* fp); int asg_arc_identify_simple_bubbles_multi(asg_t *g, bub_label_t* x, int check_cross); diff --git a/anchor.cpp b/anchor.cpp index e497b82..6722baf 100644 --- a/anchor.cpp +++ b/anchor.cpp @@ -5,8 +5,12 @@ #include "ksort.h" #include "Hash_Table.h" #include "kalloc.h" +#include "Overlaps.h" #define HA_KMER_GOOD_RATIO 0.333 +#define OFL 0.95 +#define CH_OCC 4 +#define CH_SC 16 typedef struct { // this struct is not strictly necessary; we can use k_mer_pos instead, with modifications uint64_t srt; @@ -21,7 +25,8 @@ typedef struct { // this struct is not strictly necessary; we can use k_mer_pos KRADIX_SORT_INIT(ha_an1, anchor1_t, an_key1, 8) KRADIX_SORT_INIT(ha_an2, anchor1_t, an_key2, 4) KRADIX_SORT_INIT(ha_an3, anchor1_t, an_key3, 4) - +#define generic_key(x) (x) +KRADIX_SORT_INIT(anc64, uint64_t, generic_key, 8) #define oreg_xs_lt(a, b) (((uint64_t)(a).x_pos_s<<32|(a).x_pos_e) < ((uint64_t)(b).x_pos_s<<32|(b).x_pos_e)) KSORT_INIT(or_xs, overlap_region, oreg_xs_lt) @@ -29,6 +34,9 @@ KSORT_INIT(or_xs, overlap_region, oreg_xs_lt) #define oreg_ss_lt(a, b) ((a).shared_seed > (b).shared_seed) // in the decending order KSORT_INIT(or_ss, overlap_region, oreg_ss_lt) +#define oreg_occ_lt(a, b) ((a).align_length > (b).align_length) // in the decending order +KSORT_INIT(or_occ, overlap_region, oreg_occ_lt) + #define oreg_id_lt(a, b) ((a).y_id < (b).y_id) KSORT_INIT(or_id, overlap_region, oreg_id_lt) @@ -971,6 +979,275 @@ uint32_t *low_occ) cl->length = ab->n_a; } +void gen_pair_chain(ha_abufl_t *ab, uint64_t rid, st_mt_t *tid, uint64_t tid_n, ha_mzl_t *in, uint64_t in_n, ha_mzl_t *idx, int64_t idx_n, uint64_t mzl_cutoff) +{ + if(!tid_n) return; + uint64_t i, l, k, n, m, rev, x, tn, kn; ha_mzl_t *z, *p, *y; int64_t zi, ns, ne; anchor1_t *an; + for (i = tn = 0; i < in_n; ++i) { + ///z is one of the minimizer + z = &in[i]; p = &(idx[z->x]); n = 0; + assert(z->pos == p->pos && z->rid == p->rid && z->span == p->span && z->rev == p->rev); + for (zi = z->x+1; zi < idx_n && idx[zi].x == p->x && n < mzl_cutoff; zi++) { + if(idx[zi].rid == rid) continue; + x = idx[zi].rid; x <<= 1; x |= ((uint64_t)((z->rev == idx[zi].rev)?0:1)); + for (m = 0; m < tid_n; m++) { + if(tid->a[m] == x) { + n++; break; + } + } + } + for (zi = ((int64_t)z->x)-1; zi >= 0 && idx[zi].x == p->x && n < mzl_cutoff; zi--) { + if(idx[zi].rid == rid) continue; + x = idx[zi].rid; x <<= 1; x |= ((uint64_t)((z->rev == idx[zi].rev)?0:1)); + for (m = 0; m < tid_n; m++) { + if(tid->a[m] == x) { + n++; break; + } + } + } + if((!n) || (n >= mzl_cutoff)) continue; + tn += n; + } + if(!tn) return; + ab->n_a += tn; + if (ab->n_a > ab->m_a) { + ab->m_a = ab->n_a; + REALLOC(ab->a, ab->m_a); + } + + + for (i = 0, k = ab->n_a - tn; i < in_n; ++i) { + ///z is one of the minimizer + z = &in[i]; p = &(idx[z->x]); n = 0; ns = 0; ne = -1; + for (zi = z->x+1; zi < idx_n && idx[zi].x == p->x && n < mzl_cutoff; zi++) { + if(idx[zi].rid == rid) continue; + x = idx[zi].rid; x <<= 1; x |= ((uint64_t)((z->rev == idx[zi].rev)?0:1)); + for (m = 0; m < tid_n; m++) { + if(tid->a[m] == x) { + n++; break; + } + } + } + ne = zi; + for (zi = ((int64_t)z->x)-1; zi >= 0 && idx[zi].x == p->x && n < mzl_cutoff; zi--) { + if(idx[zi].rid == rid) continue; + x = idx[zi].rid; x <<= 1; x |= ((uint64_t)((z->rev == idx[zi].rev)?0:1)); + for (m = 0; m < tid_n; m++) { + if(tid->a[m] == x) { + n++; break; + } + } + } + ns = zi + 1; + if((!n) || (n >= mzl_cutoff)) continue; + n = ne - ns; if(n > ((uint32_t)(0xffffffu))) n = 0xffffffu; n<<=8; + + for (zi = z->x+1; zi < idx_n && idx[zi].x == p->x; zi++) { + if(idx[zi].rid == rid) continue; + x = idx[zi].rid; x <<= 1; x |= ((uint64_t)((z->rev == idx[zi].rev)?0:1)); + for (m = 0; m < tid_n; m++) { + if(tid->a[m] == x) break; + } + if(m >= tid_n) continue; + + + y = &(idx[zi]); an = &ab->a[k++]; + rev = z->rev == y->rev? 0 : 1; + an->other_off = rev?((uint32_t)-1)-1-(y->pos+1-y->span):y->pos; + an->self_off = z->pos; + ///an->cnt: cnt<<8|span + an->cnt = ((z->span <= ((uint32_t)(0xffu)))?z->span:((uint32_t)(0xffu))); an->cnt |= n; + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->self_off; + } + for (zi = ((int64_t)z->x)-1; zi >= 0 && idx[zi].x == p->x; zi--) { + if(idx[zi].rid == rid) continue; + x = idx[zi].rid; x <<= 1; x |= ((uint64_t)((z->rev == idx[zi].rev)?0:1)); + for (m = 0; m < tid_n; m++) { + if(tid->a[m] == x) break; + } + if(m >= tid_n) continue; + + + + y = &(idx[zi]); an = &ab->a[k++]; + rev = z->rev == y->rev? 0 : 1; + an->other_off = rev?((uint32_t)-1)-1-(y->pos+1-y->span):y->pos; + an->self_off = z->pos; + ///an->cnt: cnt<<8|span + an->cnt = ((z->span <= ((uint32_t)(0xffu)))?z->span:((uint32_t)(0xffu))); an->cnt |= n; + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->self_off; + } + } + assert(k == ab->n_a); + radix_sort_ha_an1(ab->a + ab->n_a - tn, ab->a + ab->n_a); kn = 0; + for (k = ab->n_a - tn + 1, l = ab->n_a - tn; k <= ab->n_a; ++k) { + if (k == ab->n_a || (ab->a[k].srt>>32) != (ab->a[l].srt>>32)) { + for (; kn < tid_n && tid->a[kn] < (ab->a[l].srt>>32); kn++); + if(kn < tid_n && tid->a[kn] == (ab->a[l].srt>>32)) kv_push(uint64_t, *tid, l); + for (; kn < tid_n && tid->a[kn] == (ab->a[l].srt>>32); kn++); + l = k; + } + } +} + +void minimizers_qgen_input(ha_abufl_t *ab, uint64_t rid, char* rs, int64_t rl, uint64_t mz_w, uint64_t mz_k, Candidates_list *cl, kvec_t_u8_warp* k_flag, +void *ha_flt_tab, ha_pt_t *ha_idx, All_reads* rdb, const ul_idx_t *udb, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, +uint32_t *low_occ, ha_mzl_t *in, uint64_t in_n, ha_mzl_t *idx, int64_t idx_n, uint64_t mzl_cutoff, uint64_t chain_cutoff, kv_u_trans_t *kov) +{ + // fprintf(stderr, "+[M::%s]\n", __func__); + uint64_t i, k, l, max_cnt = UINT32_MAX, min_cnt = 0, n; ha_mzl_t *z, *p, *y; anchor1_t *an; + int64_t zi, ns, ne; uint8_t rev; k_mer_hit *s; + if(high_occ) { + max_cnt = (*high_occ); + if(max_cnt < 2) max_cnt = 2; + } + if(low_occ) { + min_cnt = (*low_occ); + if(min_cnt < 2) min_cnt = 2; + } + clear_Candidates_list(cl); + + for (i = 0, ab->n_a = 0; i < in_n; ++i) { + ///z is one of the minimizer + z = &in[i]; p = &(idx[z->x]); n = 0; + assert(z->pos == p->pos && z->rid == p->rid && z->span == p->span && z->rev == p->rev); + for (zi = z->x+1; zi < idx_n && idx[zi].x == p->x && n < mzl_cutoff; zi++) { + if(idx[zi].rid == rid) continue; n++; + } + for (zi = ((int64_t)z->x)-1; zi >= 0 && idx[zi].x == p->x && n < mzl_cutoff; zi--) { + if(idx[zi].rid == rid) continue; n++; + } + if((!n) || (n >= mzl_cutoff)) continue; + ab->n_a += n; + } + // if(rid == 0) { + // fprintf(stderr, "-0-[M::%s] ab->n_a::%lu, in_n::%lu\n", __func__, ab->n_a, in_n); + // } + + if (ab->n_a > ab->m_a) { + ab->m_a = ab->n_a; + REALLOC(ab->a, ab->m_a); + } + + for (i = 0, k = 0; i < in_n; ++i) { + ///z is one of the minimizer + z = &in[i]; p = &(idx[z->x]); n = 0; ns = 0; ne = -1; + for (zi = z->x+1; zi < idx_n && idx[zi].x == p->x && n < mzl_cutoff; zi++) { + if(idx[zi].rid == rid) continue; n++; + } + ne = zi; + for (zi = ((int64_t)z->x)-1; zi >= 0 && idx[zi].x == p->x && n < mzl_cutoff; zi--) { + if(idx[zi].rid == rid) continue; n++; + } + ns = zi + 1; + if((!n) || (n >= mzl_cutoff)) continue; + n = ne - ns; if(n > ((uint32_t)(0xffffffu))) n = 0xffffffu; n<<=8; + + for (zi = z->x+1; zi < idx_n && idx[zi].x == p->x; zi++) { + if(idx[zi].rid == rid) continue; + y = &(idx[zi]); an = &ab->a[k++]; + rev = z->rev == y->rev? 0 : 1; + an->other_off = rev?((uint32_t)-1)-1-(y->pos+1-y->span):y->pos; + an->self_off = z->pos; + ///an->cnt: cnt<<8|span + an->cnt = ((z->span <= ((uint32_t)(0xffu)))?z->span:((uint32_t)(0xffu))); an->cnt |= n; + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->self_off; + } + for (zi = ((int64_t)z->x)-1; zi >= 0 && idx[zi].x == p->x; zi--) { + if(idx[zi].rid == rid) continue; + y = &(idx[zi]); an = &ab->a[k++]; + rev = z->rev == y->rev? 0 : 1; + an->other_off = rev?((uint32_t)-1)-1-(y->pos+1-y->span):y->pos; + an->self_off = z->pos; + ///an->cnt: cnt<<8|span + an->cnt = ((z->span <= ((uint32_t)(0xffu)))?z->span:((uint32_t)(0xffu))); an->cnt |= n; + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->self_off; + } + } + assert(k == ab->n_a); + + uint64_t kn, spn; u_trans_t *ka; + kn = spn = 0; ka = NULL; sp->n = 0; + if(kov) { + kn = u_trans_n(*kov, rid); ka = u_trans_a(*kov, rid); + } + if(kn > 0) { + kv_resize(uint64_t, *sp, kn); + for (k = 0; k < kn; ++k) { + if(ka[k].del) continue; + l = ka[k].tn; l <<= 1; l |= ka[k].rev; + kv_push(uint64_t, *sp, l); + } + if(sp->n > 1) radix_sort_anc64(sp->a, sp->a + sp->n); + } + + radix_sort_ha_an1(ab->a, ab->a + ab->n_a); kn = 0; + for (k = 1, l = n = 0, spn = sp->n; k <= ab->n_a; ++k) { + if (k == ab->n_a || (ab->a[k].srt>>32) != (ab->a[l].srt>>32)) { + if(k - l >= chain_cutoff) { + for (; kn < spn && sp->a[kn] < (ab->a[l].srt>>32); kn++); + if(kn < spn && sp->a[kn] == (ab->a[l].srt>>32)) kv_push(uint64_t, *sp, n); + for (; kn < spn && sp->a[kn] == (ab->a[l].srt>>32); kn++) sp->a[kn] = (uint64_t)-1; + for (i = l; i < k; i++) ab->a[n++] = ab->a[i]; + } + l = k; + } + } + ab->n_a = n; + for (k = kn = 0; k < spn; k++) { + if(sp->a[k] == (uint64_t)-1) continue; + sp->a[kn++] = sp->a[k]; + } + // sp->n = kn; + if(kn > 0) gen_pair_chain(ab, rid, sp, kn, in, in_n, idx, idx_n, mzl_cutoff); + if(spn > 0) { + for (k = spn, kn = 0; k < sp->n; k++) sp->a[kn++] = sp->a[k]; + sp->n = kn; + } + + + // copy over to _cl_ + if (ab->n_a >= (uint64_t)cl->size) { + cl->size = ab->n_a; + REALLOC(cl->list, cl->size); + } + + uint64_t tid = (uint64_t)-1, tl = (uint64_t)-1; + for (k = 1, l = 0; k <= ab->n_a; ++k) { + if (k == ab->n_a || ab->a[k].srt != ab->a[l].srt) { + if (k-l>1) radix_sort_ha_an3(ab->a+l, ab->a+k); + if((ab->a[l].srt>>33)!=tid) { + tid = ab->a[l].srt>>33; + tl = rdb?Get_READ_LENGTH((*rdb), tid):udb->ug->u.a[tid].len; + } + for (i = l; i < k; i++) { + s = &cl->list[i]; + s->readID = ab->a[i].srt>>33; + s->strand = (ab->a[i].srt>>32)&1; + if(!(s->strand)) { + s->offset = ab->a[i].other_off; + } else { + s->offset = ((uint32_t)-1)-ab->a[i].other_off; + s->offset = tl-s->offset; + } + s->self_offset = ab->a[i].self_off; + if(((ab->a[i].cnt>>8) < max_cnt) && ((ab->a[i].cnt>>8) > min_cnt)){ + s->cnt = 1; + } else if((ab->a[i].cnt>>8) <= min_cnt) { + s->cnt = 2; + } else{ + s->cnt = 1 + (((ab->a[i].cnt>>8) + (max_cnt<<1) - 1)/(max_cnt<<1)); + s->cnt = pow(s->cnt, 1.1); + } + if(s->cnt > ((uint32_t)(0xffffffu))) s->cnt = 0xffffffu; + s->cnt <<= 8; s->cnt |= (((uint32_t)(0xffu))&(ab->a[i].cnt)); + } + l = k; + } + } + cl->length = ab->n_a; +} + void inline reverse_k_mer_hit(k_mer_hit *a, uint64_t a_n, uint64_t xl, uint64_t yl) { uint64_t z, han = a_n>>1; k_mer_hit *ai, *aj, ka; @@ -1192,18 +1469,250 @@ void lchain_qgen(Candidates_list* cl, overlap_region_alloc* ol, uint32_t rid, ui void lchain_qgen_mcopy(Candidates_list* cl, overlap_region_alloc* ol, uint32_t rid, uint64_t rl, All_reads* rdb, const ul_idx_t *udb, uint32_t apend_be, uint64_t max_n_chain, int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, int64_t quick_check, - uint32_t gen_off) + uint32_t gen_off, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, st_mt_t *sp) { // fprintf(stderr, "+[M::%s]\n", __func__); - uint64_t i, k, l, m, cn = cl->length, yid; overlap_region *r; ///srt = 0 + uint64_t i, k, l, m, cn = cl->length, yid, ol0, lch; overlap_region *r, t; ///srt = 0 clear_overlap_region_alloc(ol); - for (l = 0, k = 1, m = 0; k <= cn; k++) { + for (l = 0, k = 1, m = 0, lch = 0; k <= cn; k++) { if((k == cn) || (cl->list[k].readID != cl->list[l].readID)) { if(cl->list[l].readID != rid) { - yid = cl->list[l].readID; + yid = cl->list[l].readID; ol0 = ol->length; m += lchain_qdp_mcopy(cl, l, k-l, m, &(cl->chainDP), ol, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_rate, - rid, rl, rdb?Get_READ_LENGTH((*rdb), yid):udb->ug->u.a[yid].len, quick_check, apend_be, gen_off); + rid, rl, rdb?Get_READ_LENGTH((*rdb), yid):udb->ug->u.a[yid].len, quick_check, apend_be, gen_off, 1, mcopy_rate, mcopy_khit_cut, 1); + if((chain_cutoff >= 2) && (!lch)) { + for (i = ol0; (ilength) && (!lch); i++) { + if(ol->list[i].align_length < chain_cutoff) lch = 1; + } + } + } + l = k; + } + } + cl->length = m; + + // for (k = 0; k < ol->length; k++) { + // fprintf(stderr, "---[M::%s::utg%.6dl] q[%d, %d), t[%d, %d), khit_off::%u\n", __func__, + // (int32_t)ol->list[k].y_id+1, ol->list[k].x_pos_s, ol->list[k].x_pos_e+1, + // ol->list[k].y_pos_s, ol->list[k].y_pos_e+1, ol->list[k].non_homopolymer_errors); + // } + + k = ol->length; + if (ol->length > max_n_chain) { + int32_t w, n[4], s[4]; + n[0] = n[1] = n[2] = n[3] = 0, s[0] = s[1] = s[2] = s[3] = 0; + ks_introsort_or_ss(ol->length, ol->list); + for (i = 0; i < ol->length; ++i) { + r = &(ol->list[i]); + w = ha_ov_type(r, rl); + ++n[w]; + if (((uint64_t)n[w]) == max_n_chain) s[w] = r->shared_seed; + } + if (s[0] > 0 || s[1] > 0 || s[2] > 0 || s[3] > 0) { + // n[0] = n[1] = n[2] = n[3] = 0; + for (i = 0, k = 0, lch = 0; i < ol->length; ++i) { + r = &(ol->list[i]); + w = ha_ov_type(r, rl); + // ++n[w]; + // if (((int)n[w] <= max_n_chain) || (r->shared_seed >= s[w] && s[w] >= (asm_opt.k_mer_length<<1))) { + if (r->shared_seed >= s[w]) { + if (k != i) { + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + if(ol->list[k].align_length < chain_cutoff) lch = 1; + ++k; + } + } + ol->length = k; + } + } + + ks_introsort_or_xs(ol->length, ol->list); + if(lch) { + //@brief r485 + uint64_t zs, ze, rs, re, ob, os, oe, ocn, pp, kn, ms, me; int64_t osc; + for (i = l = 0, cn = cl->length; i < ol->length; ++i) { + if(ol->list[i].align_length < chain_cutoff) { + zs = ol->list[i].x_pos_s; ze = ol->list[i].x_pos_e + 1; + ob = (ze - zs)*OFL; if(ob < 16) ob = 16; + osc = ol->list[i].shared_seed*CH_SC; + ocn = ol->list[i].align_length<length) && (ze > ol->list[k].x_pos_s); k++) { + if(ol->list[k].align_length < chain_cutoff) continue; + if(ol->list[k].align_length < ocn) continue; + if(ol->list[k].shared_seed < osc) continue; + rs = ol->list[k].x_pos_s; re = ol->list[k].x_pos_e + 1; + os = ((rs>=zs)?rs:zs); oe = ((re<=ze)?re:ze); + if((oe > os) && (oe - os) >= ob) { + m = ol->list[k].non_homopolymer_errors; + pp = cl->list[m].readID; kn = 0; + for (; (m < cn) && (cl->list[m].readID == pp) && (kn < ocn); m++) { + me = cl->list[m].self_offset; ms = me - (cl->list[m].cnt&(0xffu)); + if((ms >= os) && (me <= oe)) kn++; + } + if(kn >= ocn) break; + } + } + if((k < ol->length) && (ze > ol->list[k].x_pos_s)) continue; + } + if (l != i) { + t = ol->list[l]; + ol->list[l] = ol->list[i]; + ol->list[i] = t; + } + l++; + } + // fprintf(stderr, "+[M::%s] rid::%u, ol->length0::%lu, ol->length1::%lu\n", __func__, rid, ol->length, l); + ol->length = l; + + + /** + //@brief r484 + for (i = sp->n = 0; i < ol->length; ++i) { + if(ol->list[i].align_length < chain_cutoff) continue; + os = ol->list[i].x_pos_s; oe = ol->list[i].x_pos_e + 1; + if((sp->n) && (((uint32_t)sp->a[sp->n-1]) >= os)) { + if(oe > ((uint32_t)sp->a[sp->n-1])) { + oe = oe - ((uint32_t)sp->a[sp->n-1]); + sp->a[sp->n-1] += oe; + } + } else { + os = (os<<32)|oe; kv_push(uint64_t, *sp, os); + } + } + + for (i = k = 0; i < ol->length; ++i) { + if(ol->list[i].align_length < chain_cutoff) {///ol has been sorted by x_pos_s + r = &(ol->list[i]); rs = r->x_pos_s; re = r->x_pos_e + 1; + rl = re - rs; ovl = 0; + for (m = 0; (m < sp->n) && (re > (sp->a[m]>>32)); m++) { + os = ((rs>=(sp->a[m]>>32))? rs:(sp->a[m]>>32)); + oe = ((re<=((uint32_t)sp->a[m]))? re:((uint32_t)sp->a[m])); + if(oe > os) { + ovl += (oe - os); if(ovl >= (rl*0.95)) break; + } + } + if(ovl >= (rl*0.95)) continue; + } + if (k != i) { + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + ol->list[k++].align_length = 0; + } + // fprintf(stderr, "+[M::%s] ol->length0::%lu, ol->length1::%lu\n", __func__, ol->length, k); + ol->length = k; + **/ + } + /**else { + for (i = 0; i < ol->length; ++i) ol->list[i].align_length = 0; + } + **/ + for (i = 0; i < ol->length; ++i) ol->list[i].align_length = 0; +} + +inline uint64_t special_lchain(Candidates_list* cl, overlap_region_alloc* ol, uint32_t rid, uint64_t rl, All_reads* rdb, + const ul_idx_t *udb, uint32_t apend_be, int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, + double chn_pen_skip, double bw_rate, int64_t quick_check, uint32_t gen_off, double mcopy_rate, uint32_t mcopy_khit_cut, + st_mt_t *sp, uint64_t *si, uint64_t m, uint64_t l, uint64_t k) +{ + uint64_t z, s, e, yid, ol0 = ol->length, ol1; + if(l >= k) return m; + yid = cl->list[l].readID; + for (z = l; z < k && cl->list[z].strand == cl->list[l].strand; z++); + if(z > l) { + s = l; e = z; ol1 = ol->length; // m0 = m; + m += lchain_qdp_mcopy(cl, s, e-s, m, &(cl->chainDP), ol, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_rate, + rid, rl, rdb?Get_READ_LENGTH((*rdb), yid):udb->ug->u.a[yid].len, quick_check, apend_be, gen_off, 0, mcopy_rate, mcopy_khit_cut, 0); + // if(rid == 7) { + // fprintf(stderr, "+[M::%s::utg%.6ul] inn::[%lu, %lu), (*si)::%lu, ol1::%lu, ol->length::%lu\n", + // __func__, rid+1, s, e, (*si), ol1, ol->length); + // } + if(((*si) < sp->n) && (sp->a[(*si)] == s) && (ol->length > ol1)) { + ol->list[ol->length-1].x_pos_strand = 1; + // fprintf(stderr, "+[M::%s] utg%.6u%c -> utg%.6u%c, n_khits0::%lu, n_khits::%lu\n", __func__, rid+1, "lc"[udb->ug->u.a[rid].circ], + // ol->list[ol->length-1].y_id+1, "lc"[udb->ug->u.a[ol->list[ol->length-1].y_id].circ], e-s, m-m0); + } + for (; (*si) < sp->n && sp->a[(*si)] == s; (*si)++); + } + + if(z < k) { + s = z; e = k; ol1 = ol->length; // m0 = m; + m += lchain_qdp_mcopy(cl, s, e-s, m, &(cl->chainDP), ol, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_rate, + rid, rl, rdb?Get_READ_LENGTH((*rdb), yid):udb->ug->u.a[yid].len, quick_check, apend_be, gen_off, 0, mcopy_rate, mcopy_khit_cut, 0); + // if(rid == 7) { + // fprintf(stderr, "-[M::%s::utg%.6ul] inn::[%lu, %lu), (*si)::%lu, ol1::%lu, ol->length::%lu\n", + // __func__, rid+1, s, e, (*si), ol1, ol->length); + // } + if(((*si) < sp->n) && (sp->a[(*si)] == s) && (ol->length > ol1)) { + ol->list[ol->length-1].x_pos_strand = 1; + // fprintf(stderr, "-[M::%s] utg%.6u%c -> utg%.6u%c, n_khits0::%lu, n_khits::%lu\n", __func__, rid+1, "lc"[udb->ug->u.a[rid].circ], + // ol->list[ol->length-1].y_id+1, "lc"[udb->ug->u.a[ol->list[ol->length-1].y_id].circ], e-s, m-m0); + } + for (; (*si) < sp->n && sp->a[(*si)] == s; (*si)++); + } + if(ol->length > ol0) { + overlap_region *p = &(ol->list[ol0]); uint64_t pi = ol0; overlap_region t; + for (z = ol0; z < ol->length; z++) { + if((p->shared_seed < ol->list[z].shared_seed) || + ((p->shared_seed == ol->list[z].shared_seed) && (ol->list[z].x_pos_strand == 1))) { + p = &(ol->list[z]); pi = z; + } + } + + // if(rid == 7) { + // fprintf(stderr, ">[M::%s::] utg%.6ul\t->\tutg%.6ul\tflag::%u\n", + // __func__, p->x_id+1, p->y_id+1, p->x_pos_strand); + // } + + for (z = s = ol0; z < ol->length; z++) { + if((ol->list[z].x_pos_strand == 0) && (z != pi)) continue; + if(s != z) { + t = ol->list[s]; + ol->list[s] = ol->list[z]; + ol->list[z] = t; + } + s++; + } + ol->length = s; + } + return m; +} + +void lchain_qgen_mcopy_input(Candidates_list* cl, overlap_region_alloc* ol, uint32_t rid, uint64_t rl, All_reads* rdb, + const ul_idx_t *udb, uint32_t apend_be, uint64_t max_n_chain, int64_t max_skip, int64_t max_iter, + int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, double bw_thres_sec, + int64_t quick_check, uint32_t gen_off, double mcopy_rate, uint32_t mcopy_khit_cut, st_mt_t *sp) +{ + // fprintf(stderr, "+[M::%s]\n", __func__); + uint64_t i, k, l, m, cn = cl->length, yid, si; overlap_region *r; ///srt = 0 + clear_overlap_region_alloc(ol); + + // if(rid == 160) { + // fprintf(stderr, "[M::%s::utg%.6ul] sp->n::%u\n", __func__, rid+1, (uint32_t)sp->n); + // } + for (l = 0, k = 1, m = 0, si = 0; k <= cn; k++) { + if((k == cn) || (cl->list[k].readID != cl->list[l].readID)) { + if(cl->list[l].readID != rid) { + yid = cl->list[l].readID; + // if(rid == 160) { + // fprintf(stderr, "+[M::%s::utg%.6lul] [%lu, %lu), m::%lu, ol->length::%lu\n", __func__, yid+1, l, k, m, ol->length); + // } + for (; si < sp->n && sp->a[si] < l; si++); + if(si >= sp->n || sp->a[si] >= k) {///might be unmatched + m += lchain_qdp_mcopy(cl, l, k-l, m, &(cl->chainDP), ol, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_rate, + rid, rl, rdb?Get_READ_LENGTH((*rdb), yid):udb->ug->u.a[yid].len, quick_check, apend_be, gen_off, 0, mcopy_rate, mcopy_khit_cut, 0); + } else { + m = special_lchain(cl, ol, rid, rl, rdb, udb, apend_be, max_skip, max_iter, max_dis, chn_pen_gap, + chn_pen_skip, bw_thres_sec, quick_check, gen_off, mcopy_rate, mcopy_khit_cut, sp, &si, m, l, k); + } + // if(rid == 160) { + // fprintf(stderr, "-[M::%s::utg%.6lul] [%lu, %lu), m::%lu, ol->length::%lu\n", __func__, yid+1, l, k, m, ol->length); + // } } l = k; } @@ -1234,7 +1743,7 @@ void lchain_qgen_mcopy(Candidates_list* cl, overlap_region_alloc* ol, uint32_t r w = ha_ov_type(r, rl); // ++n[w]; // if (((int)n[w] <= max_n_chain) || (r->shared_seed >= s[w] && s[w] >= (asm_opt.k_mer_length<<1))) { - if (r->shared_seed >= s[w]) { + if ((r->shared_seed >= s[w]) || (r->x_pos_strand == 1)) { if (k != i) { t = ol->list[k]; ol->list[k] = ol->list[i]; @@ -1265,7 +1774,7 @@ void set_lchain_dp_op(uint32_t is_accurate, uint32_t mz_k, int64_t *max_skip, in } void ul_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off) + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut) { extern void *ha_flt_tab; extern ha_pt_t *ha_idx; @@ -1276,18 +1785,24 @@ void ul_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t // lchain_gen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); // lchain_qgen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); ///no need to sort here, overlap_list has been sorted at lchain_gen - lchain_qgen_mcopy(cl, overlap_list, rid, rl, NULL, uref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); + lchain_qgen_mcopy(cl, overlap_list, rid, rl, NULL, uref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); } -void ug_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, uint32_t is_hpc) +int64_t ug_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, double bw_thres_sec, + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, + uint32_t gen_off, double mcopy_rate, uint32_t mcopy_khit_cut, uint32_t is_hpc, ha_mzl_t *res, uint64_t res_n, ha_mzl_t *idx, uint64_t idx_n, uint64_t mzl_cutoff, uint64_t chain_cutoff, kv_u_trans_t *kov) { int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; set_lchain_dp_op(is_accurate, mz_k, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); if((!overlap_list) || (!cl)) { ab->mz.n = 0; - mz2_ha_sketch(rs, rl, mz_w, mz_k, 0, is_hpc, &ab->mz, NULL, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL); + mz2_ha_sketch(rs, rl, mz_w, mz_k, rid, is_hpc, &ab->mz, NULL, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL); + if(res) memcpy(res, ab->mz.a, ab->mz.n * (sizeof((*(ab->mz.a))))); + return ab->mz.n; } else { - + sp->n = 0; + minimizers_qgen_input(ab, rid, rs, rl, mz_w, mz_k, cl, k_flag, ha_flt_tab, ha_idx, NULL, uref, dbg_ct, sp, high_occ, low_occ, res, res_n, idx, idx_n, mzl_cutoff, chain_cutoff, kov); + lchain_qgen_mcopy_input(cl, overlap_list, rid, rl, NULL, uref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, bw_thres_sec, quick_check, gen_off, mcopy_rate, mcopy_khit_cut, sp); + return 0; } } diff --git a/gchain_map.cpp b/gchain_map.cpp index eb4b25f..41bb0fb 100644 --- a/gchain_map.cpp +++ b/gchain_map.cpp @@ -18,8 +18,8 @@ #include "Assembly.h" #include "gchain_map.h" KSEQ_INIT(gzFile, gzread) -void ul_map_lchain(ha_abufl_t *ab, int64_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, overlap_region_alloc *overlap_list_hp, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off); +void ul_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); typedef struct { // global data structure for kt_pipeline() const void *ha_flt_tab; diff --git a/gfa_ut.cpp b/gfa_ut.cpp index 7892c67..720f8ed 100644 --- a/gfa_ut.cpp +++ b/gfa_ut.cpp @@ -2477,8 +2477,8 @@ double ou_drop_rate, int64_t max_tip, int64_t gap_fuzz, bub_label_t *b_mask_t, i // (int)Get_NAME_LENGTH(R_INF, 10809), Get_NAME(R_INF, 10809), 10809, is_contain_r((*rI), 10809)); // fprintf(stderr, "%.*s\tid::%u\tis_c::%u\n", // (int)Get_NAME_LENGTH(R_INF, 10819), Get_NAME(R_INF, 10819), 10819, is_contain_r((*rI), 10819)); - // debug_info_of_specfic_node("m64012_190921_234837/111673711/ccs", sg, rI, "beg"); - // debug_info_of_specfic_node("m64011_190830_220126/95028102/ccs", sg, rI, "beg"); + // debug_info_of_specfic_node("m64011_190830_220126/47516220/ccs", sg, rI, "beg-0"); + // debug_info_of_specfic_node("m64012_190920_173625/163644465/ccs", sg, rI, "beg-0"); asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); // fprintf(stderr, "[M::%s] count_edges_v_w(sg, 49778, 49847)->%ld\n", __func__, count_edges_v_w(sg, 49778, 49847)); @@ -2491,12 +2491,11 @@ double ou_drop_rate, int64_t max_tip, int64_t gap_fuzz, bub_label_t *b_mask_t, i // print_vw_edge(sg, 34156, 34090, "0"); // stats_chimeric(sg, src, &bu); if(!is_ou) asg_iterative_semi_circ(sg, src, &bu, max_tip, 1); - asg_arc_identify_simple_bubbles_multi(sg, b_mask_t, 1); asg_arc_cut_chimeric(sg, src, &bu, is_ou?ou_thres:(uint32_t)-1); + asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); // prt_specfic_sge(sg, 10531, 10519, "--1--"); - asg_arc_identify_simple_bubbles_multi(sg, b_mask_t, 0); asg_arc_cut_inexact(sg, src, &bu, max_tip, is_ou, is_trio, ou_drop_rate/**, NULL**//**&dbg**/); // debug_edges(&dbg, d, 2); @@ -2519,7 +2518,6 @@ double ou_drop_rate, int64_t max_tip, int64_t gap_fuzz, bub_label_t *b_mask_t, i asg_arc_cut_complex_bub_links(sg, &bu, HARD_OL_DROP, HARD_OU_DROP, is_ou, b_mask_t); asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); // prt_specfic_sge(sg, 10531, 10519, "--5--"); - if(is_ou) { if(ul_refine_alignment(uopt, sg)) update_sg_uo(sg, src); @@ -2576,6 +2574,98 @@ double ou_drop_rate, int64_t max_tip, int64_t gap_fuzz, bub_label_t *b_mask_t, i free(bu.a); free(ba.a); } +int32_t gen_ext_tip(uint32_t v, asg_t *sg, ug_opt_t *uopt, uint8_t *ff, uint32_t min_ovlp, uint64_t *res, uint64_t *path_len) +{ + uint32_t k, qn, tn, cc, ccu; int32_t r; asg_arc_t p, pmax; + ma_hit_t_alloc *s = &(uopt->sources[v>>1]); memset(&pmax, 0, sizeof(pmax)); pmax.ol = 0; + for (k = 0; k < s->length; k++) { + qn = Get_qn(s->buffer[k]); tn = Get_tn(s->buffer[k]); + r = ma_hit2arc(&(s->buffer[k]), sg->seq[qn].len, sg->seq[tn].len, uopt->max_hang, asm_opt.max_hang_rate, uopt->min_ovlp, &p); + if(r < 0) continue; + if((p.ul>>32) != v) continue; + if(p.ol < min_ovlp) continue; + if(p.ol > pmax.ol) pmax = p; + } + if(pmax.ol == 0) return 0; + + tn = pmax.v>>1; + if((!(sg->seq[tn].del)) || (ff[tn])) return -1;///not a tip + get_R_to_U(uopt->ruIndex, tn, &cc, &ccu); + if(ccu == 1) return -1; + if((cc != (uint32_t)-1) && + ((!(sg->seq[cc].del)) || (ff[cc]))) { + return -1;///contained in an existing read + } + + if((*path_len) >= sg->seq[v>>1].len) (*path_len) -= sg->seq[v>>1].len; + else (*path_len) = 0; + (*path_len) += (uint32_t)(pmax.ul) + sg->seq[pmax.v>>1].len; + + uint32_t v0 = v; (*res) = pmax.v; + v = (pmax.v^1); s = &(uopt->sources[v>>1]); pmax.ol = 0; + for (k = 0; k < s->length; k++) { + qn = Get_qn(s->buffer[k]); tn = Get_tn(s->buffer[k]); + r = ma_hit2arc(&(s->buffer[k]), sg->seq[qn].len, sg->seq[tn].len, uopt->max_hang, asm_opt.max_hang_rate, uopt->min_ovlp, &p); + if(r < 0) continue; + if((p.ul>>32) != v) continue; + if(p.ol < min_ovlp) continue; + if(p.ol > pmax.ol) pmax = p; + } + if((pmax.v^1) == v0) { + ff[(*res)>>1] = 1; return 1; + } + (*res) = (uint64_t)-1; return -1;///not the longest overlap +} + +void ug_ext_gfa(ug_opt_t *uopt, asg_t *sg, uint32_t max_len) +{ + asg_arc_t *av; asg64_v res, idx; kv_init(res); kv_init(idx); + uint32_t v, nv, k, z, nvtx = sg->n_seq<<1, tip_n = uopt->tipsLen + 1, s, e; + uint8_t *ff; CALLOC(ff, sg->n_seq); int32_t rr; uint64_t v0, w, bn, plen; + asg_arc_t t, *p; + for (v = 0; v < nvtx; v++) { + if(sg->seq[v>>1].del) continue; + av = asg_arc_a(sg, v); nv = asg_arc_n(sg, v); + for (k = 0; k < nv && av[k].del; k++); + if(k < nv) continue; + for (z = 0, v0 = v, rr = 0, bn = res.n, plen = sg->seq[v0>>1].len; z < tip_n || plen < max_len; z++) { + rr = gen_ext_tip(v0, sg, uopt, ff, 2000, &w, &plen); + if(rr <= 0) break; + kv_push(uint64_t, res, (v0<<32)|w); v0 = w; + } + if(rr < 0 || (z >= tip_n && plen >= max_len)) { + for (z = bn; z < res.n; z++) { + ff[((uint32_t)res.a[z])>>1] = 0; + } + res.n = bn; + } + if(res.n > bn) { + bn <<= 32; bn |= (uint64_t)res.n; + kv_push(uint64_t, idx, bn); + } + } + + if(idx.n > 0) { + for (k = 0; k < idx.n; k++) { + s = idx.a[k]>>32; e = (uint32_t)idx.a[k]; + for (z = s; z < e; z++) { + v = res.a[z]>>32; w = (uint32_t)res.a[z]; + assert((!sg->seq[v>>1].del) && (sg->seq[w>>1].del)); + sg->seq[w>>1].del = 0; + rr = gen_spec_edge(sg, uopt, v, w, &t); assert(rr >= 0); + p = asg_arc_pushp(sg); *p = t; + rr = gen_spec_edge(sg, uopt, w^1, v^1, &t); assert(rr >= 0); + p = asg_arc_pushp(sg); *p = t; + } + } + + free(sg->idx); sg->idx = 0; sg->is_srt = 0; asg_cleanup(sg); + fprintf(stderr, "[M::%s::] # tips::%u\n", __func__, (uint32_t)idx.n); + } + + free(ff); kv_destroy(res); kv_destroy(idx); +} + bubble_type *gen_bubble_chain(asg_t *sg, ma_ug_t *ug, ug_opt_t *uopt, uint8_t **ir_het) { @@ -2586,17 +2676,17 @@ bubble_type *gen_bubble_chain(asg_t *sg, ma_ug_t *ug, ug_opt_t *uopt, uint8_t ** asg_t *copy_sg = copy_read_graph(sg); ma_ug_t *copy_ug = copy_untig_graph(ug); - + // fprintf(stderr, "0[M::%s]\n", __func__); adjust_utg_by_primary(©_ug, copy_sg, TRIO_THRES, uopt->sources, uopt->reverse_sources, uopt->coverage_cut, uopt->tipsLen, uopt->tip_drop_ratio, uopt->stops_threshold, uopt->ruIndex, uopt->chimeric_rate, uopt->drop_ratio, uopt->max_hang, uopt->min_ovlp, &new_rtg_edges, &cov, uopt->b_mask_t, 0, 0); ma_ug_destroy(copy_ug); copy_ug = NULL; asg_destroy(copy_sg); copy_sg = NULL; - + // fprintf(stderr, "1[M::%s]\n", __func__); CALLOC(bub, 1); (*ir_het) = cov->t_ch->ir_het; cov->t_ch->ir_het = NULL; cov->is_r_het = NULL; identify_bubbles(ug, bub, (*ir_het), NULL); - + // fprintf(stderr, "2[M::%s]\n", __func__); kv_destroy(new_rtg_edges.a); destory_hap_cov_t(&cov); if(asm_opt.purge_level_primary == 0) {///all nodes are het uint32_t k; @@ -2604,6 +2694,7 @@ bubble_type *gen_bubble_chain(asg_t *sg, ma_ug_t *ug, ug_opt_t *uopt, uint8_t ** if(IF_HOM(k, *bub)) bub->index[k] = bub->f_bub+1; } } + // fprintf(stderr, "3[M::%s]\n", __func__); return bub; } @@ -9890,6 +9981,30 @@ static inline int usg_end(const usg_t *g, uint32_t v, uint64_t *lw) return ASG_ET_MERGEABLE; } +uint32_t usg_real_tip(usg_t *g, uint32_t v0, double rate) +{ + usg_arc_t *av, *aw; uint32_t nv, nw, i, k, ov, ow; + av = usg_arc_a(g, v0); + nv = usg_arc_n(g, v0); + for (i = 0; i < nv; i++) { + if(av[i].del) continue; + aw = usg_arc_a(g, (av[i].v^1)); + nw = usg_arc_n(g, (av[i].v^1)); + for (k = ov = ow = 0; k < nw; k++) { + if(aw[k].del) continue; + if((aw[k].v>>1) == (v0>>1)) { + if(aw[k].ol > ov) ov = aw[k].ol; + } else { + if(aw[k].ol > ow) ow = aw[k].ol; + } + } + if(ow == 0 || ov >= ow) return 0; + if(ov > (ow*rate)) return 0; + } + return 1; +} + + uint32_t usg_arc_cut_tips(usg_t *g, uint32_t max_ext, uint32_t ignore_ul, asg64_v *in) { asg64_v tx = {0,0,0}, *b = NULL; @@ -9940,7 +10055,7 @@ uint32_t usg_arc_cut_tips(usg_t *g, uint32_t max_ext, uint32_t ignore_ul, asg64_ if(kv <= max_ext) { ff = 0; - if(!ignore_ul) { + if(!ignore_ul) {///consider UL for (i = pb; i + 1 < b->n; i++) { p = get_usg_arc(g, ((uint32_t)b->a[i]), ((uint32_t)b->a[i+1])); assert(p); if(p->ou > 1) {//ignore ou == 1 @@ -9961,10 +10076,30 @@ uint32_t usg_arc_cut_tips(usg_t *g, uint32_t max_ext, uint32_t ignore_ul, asg64_ } } + // if((v>>1) == 139131) { + // fprintf(stderr, "+[M::%s::] v>>1::%u, v&1::%u, kv::%u, ff::%u, pb::%u, b->n::%u, w>>1::%u, w&1::%u\n", __func__, v>>1, v&1, kv, + // ff, pb, (uint32_t)b->n, ((uint32_t)b->a[b->n-1])>>1, ((uint32_t)b->a[b->n-1])&1); + // } + + if((ff == 0) && (pb < b->n) && (!usg_real_tip(g, ((uint32_t)b->a[b->n-1]), 0.75))) { + ff = 1; + } + + // if((v>>1) == 139131) { + // fprintf(stderr, "-[M::%s::] v>>1::%u, v&1::%u, kv::%u, ff::%u, pb::%u, b->n::%u, w>>1::%u, w&1::%u\n", __func__, v>>1, v&1, kv, + // ff, pb, (uint32_t)b->n, ((uint32_t)b->a[b->n-1])>>1, ((uint32_t)b->a[b->n-1])&1); + // } + if(ff == 0) { for (i = pb; i < b->n; i++) usg_seq_del(g, ((uint32_t)b->a[i])>>1); cnt++; } + + // if((v>>1) == 139131) { + // fprintf(stderr, ">[M::%s::] v>>1::%u, v&1::%u, kv::%u, ff::%u, pb::%u, b->n::%u, w>>1::%u, w&1::%u, del::%u\n", __func__, v>>1, v&1, kv, + // ff, pb, (uint32_t)b->n, ((uint32_t)b->a[b->n-1])>>1, ((uint32_t)b->a[b->n-1])&1, + // g->a[((uint32_t)b->a[b->n-1])>>1].del); + // } } b->n = pb; } @@ -13036,7 +13171,9 @@ void renew_usg_t_bub(ul_resolve_t *uidx, usg_t *ng, uint32_t *id_map, uint8_t *f if(k + 1 == u->n) id_map[v>>1] |= p[(v)&1]; // dn++; } + // fprintf(stderr, "+[M::%s] i::%u\n", __func__, i); merge_hybrid_utg_content(u, uidx->l1_ug, uidx->sg, ng, &e); + // fprintf(stderr, "-[M::%s] i::%u\n", __func__, i); ug->g->seq[i].len = u->len; } kv_destroy(e.a); @@ -13917,6 +14054,11 @@ static void gen_scaffold_id(void *data, long i, int tid) // callback for kt_for( uc_block_t *p, *n, *m0, *m1; sv = (v&1?((ug->u.a[v>>1].a[0]>>32)^1):(ug->u.a[v>>1].a[ug->u.a[v>>1].n-1]>>32)); sw = (w&1?((ug->u.a[w>>1].a[ug->u.a[w>>1].n-1]>>32)^1):(ug->u.a[w>>1].a[0]>>32)); + // if((((v>>1) == 15016) && ((w>>1) == 2778)) || (((v>>1) == 2778) && ((w>>1) == 15016))) { + // fprintf(stderr, "[M::%s::]\t%.*s(%c)(sv>>1::%u)(utg%.6ul(%c))\t%.*s(%c)(sw>>1::%u)(utg%.6ul(%c))\n", __func__, + // (int)Get_NAME_LENGTH(R_INF, (sv>>1)), Get_NAME(R_INF, (sv>>1)), "+-"[sv&1], sv>>1, (v>>1)+1, "+-"[v&1], + // (int)Get_NAME_LENGTH(R_INF, (sw>>1)), Get_NAME(R_INF, (sw>>1)), "+-"[sw&1], sw>>1, (w>>1)+1, "+-"[w&1]); + // } sv = Get_READ_LENGTH(R_INF, (sv>>1)); sw = Get_READ_LENGTH(R_INF, (sw>>1)); max_ql = MIN(sv, sw); max_ql = -max_ql; // if((((v>>1) == 235) && ((w>>1) == 153)) || (((v>>1) == 153) && ((w>>1) == 235))) { @@ -13942,8 +14084,8 @@ static void gen_scaffold_id(void *data, long i, int tid) // callback for kt_for( if(n->base || (!n->el) || (!n->pchain) || (n->pidx != (uint32_t)-1)) continue; uw = (((uint32_t)(n->hid))<<1)|((uint32_t)(n->rev)); if(uw == w) { - // if((((v>>1) == 235) && ((w>>1) == 153)) || (((v>>1) == 153) && ((w>>1) == 235))) { - // fprintf(stderr, "+[M::%s] ul_id::%lu\n", __func__, a[k]>>32); + // if((((v>>1) == 15016) && ((w>>1) == 2778)) || (((v>>1) == 2778) && ((w>>1) == 15016))) { + // fprintf(stderr, "+[M::%s] ul_id::%lu, qs::%u, qe::%u\n", __func__, a[k]>>32, n->qs, n->qe); // // print_ul_alignment(ug, &UL_INF, a[k]>>32, "+++"); // } ql = ((int64_t)n->qs) - ((int64_t)p->qe); @@ -13961,9 +14103,9 @@ static void gen_scaffold_id(void *data, long i, int tid) // callback for kt_for( if(n->base || (!n->el) || (!n->pchain) || (n->aidx != (uint32_t)-1)) continue; uw = (((uint32_t)(n->hid))<<1)|((uint32_t)(n->rev)); uw ^= 1; if(uw == w) { - // if((((v>>1) == 235) && ((w>>1) == 153)) || (((v>>1) == 153) && ((w>>1) == 235))) { - // fprintf(stderr, "-[M::%s] ul_id::%lu\n", __func__, a[k]>>32); - // // print_ul_alignment(ug, &UL_INF, a[k]>>32, "---"); + // if((((v>>1) == 15016) && ((w>>1) == 2778)) || (((v>>1) == 2778) && ((w>>1) == 15016))) { + // fprintf(stderr, "-[M::%s] ul_id::%lu, qs::%u, qe::%u\n", __func__, a[k]>>32, n->qs, n->qe); + // // print_ul_alignment(ug, &UL_INF, a[k]>>32, "+++"); // } ql = ((int64_t)p->qs) - ((int64_t)n->qe); if(ql > max_ql && n->qe < p->qe && n->qs < p->qs) { @@ -13976,13 +14118,13 @@ static void gen_scaffold_id(void *data, long i, int tid) // callback for kt_for( } if(m0 && m1 && id != ((uint32_t)-1)) { - // if((((v>>1) == 235) && ((w>>1) == 153)) || (((v>>1) == 153) && ((w>>1) == 235))) { - // fprintf(stderr, "[M::%s] id::%u, v>>1::%u(%c), w>>1::%u(%c)\n", - // __func__, id, v>>1, "+-"[v&1], w>>1, "+-"[w&1]); - // } z->ulid = (id<<1)|rev; z->raw_sof = m0->qe; z->raw_eof = m1->qs; + // if((((v>>1) == 15016) && ((w>>1) == 2778)) || (((v>>1) == 2778) && ((w>>1) == 15016))) { + // fprintf(stderr, "[M::%s] id::%u, v>>1::%u(%c), w>>1::%u(%c), raw_sof::%u, raw_eof::%u\n", + // __func__, id, v>>1, "+-"[v&1], w>>1, "+-"[w&1], z->raw_sof, z->raw_eof); + // } } } @@ -14069,7 +14211,7 @@ ma_ug_t *convert_usg_t(usg_t *ng, ma_ug_t *ref) return ug; } -int32_t gen_spec_rc_edge(asg_t *rg, ug_opt_t *uopt, uint32_t v, uint32_t w, asg_arc_t *t) +int32_t gen_spec_rc_edge(asg_t *rg, ug_opt_t *uopt, uint32_t v, uint32_t w, asg_arc_t *t, ma_hit_t **te) { uint32_t k, qn, tn; int32_t r; ma_hit_t_alloc *s = &(uopt->reverse_sources[v>>1]); asg_arc_t p; for (k = 0; k < s->length; k++) { @@ -14078,7 +14220,7 @@ int32_t gen_spec_rc_edge(asg_t *rg, ug_opt_t *uopt, uint32_t v, uint32_t w, asg_ r = ma_hit2arc(&(s->buffer[k]), rg->seq[qn].len, rg->seq[tn].len, uopt->max_hang, asm_opt.max_hang_rate, uopt->min_ovlp, &p); if(r < 0) continue; if((p.ul>>32) != v || p.v != w) continue; - *t = p; t->ou = 0; + *t = p; t->ou = 0; if(te) (*te) = &(s->buffer[k]); return 1; } return -1; @@ -14086,10 +14228,10 @@ int32_t gen_spec_rc_edge(asg_t *rg, ug_opt_t *uopt, uint32_t v, uint32_t w, asg_ void push_direct_scaff_node(usc_t *psa, asg_t *ng, ug_opt_t *uopt, uint64_t vx, uint64_t wx) { - uint64_t uol, dif; int32_t r; asg_arc_t t, *p; + uint64_t uol, dif; int32_t r; asg_arc_t t, *p; ma_hit_t *rc0, *rc1; uol = psa->raw_sof - psa->raw_eof; assert(uol >= (uint64_t)uopt->min_ovlp); - r = gen_spec_rc_edge(ng, uopt, vx, wx, &t); + r = gen_spec_rc_edge(ng, uopt, vx, wx, &t, &rc0); if(r >= 0) { if(uol > t.ol) dif = uol - t.ol; else dif = t.ol - uol; @@ -14097,11 +14239,17 @@ void push_direct_scaff_node(usc_t *psa, asg_t *ng, ug_opt_t *uopt, uint64_t vx, if(dif > 16) r = -1; } } - + // if((((vx>>1) == 3313901) && ((wx>>1) == 3555473)) || (((vx>>1) == 3555473) && ((wx>>1) == 3313901))) { + // fprintf(stderr, "[M::%s::]\t%.*s(%c)(sv>>1::%lu)\t%.*s(%c)(sw>>1::%lu)\tr::%d\n", __func__, + // (int)Get_NAME_LENGTH(R_INF, (vx>>1)), Get_NAME(R_INF, (vx>>1)), "+-"[vx&1], vx>>1, + // (int)Get_NAME_LENGTH(R_INF, (wx>>1)), Get_NAME(R_INF, (wx>>1)), "+-"[wx&1], wx>>1, r); + // } if(r >= 0) { p = asg_arc_pushp(ng); *p = t; - r = gen_spec_rc_edge(ng, uopt, wx^1, vx^1, &t); + r = gen_spec_rc_edge(ng, uopt, wx^1, vx^1, &t, &rc1); assert(r >= 0); p = asg_arc_pushp(ng); *p = t; + add_ma_hit_t_alloc(&(R_INF.paf[Get_qn((*rc0))]), rc0); + add_ma_hit_t_alloc(&(R_INF.paf[Get_qn((*rc1))]), rc1); } else { push_scaff_node(R_INF.paf, vx, wx, psa->raw_sof-psa->raw_eof, ng); @@ -14408,7 +14556,14 @@ void u2g_hybrid_detan_iter(ul_resolve_t *uidx, usg_t *ng, uint32_t max_ext, uint // ncut += ug_ext_strict(ob, ub, uidx, ng, max_ext, &ff, &ng_occ, &i_idx, &b64, &ub64); // prt_usg_t(uidx, ng, "ng.db"); if(ncut) { - usg_cleanup(ng); usg_arc_cut_tips(ng, max_ext, 1, ub); + usg_cleanup(ng); + // if((max_ext>>1) > 0) usg_arc_cut_tips(ng, (max_ext>>1), 1, ub); + usg_arc_cut_tips(ng, max_ext, 1, ub); + + // if(ng->n > 139131) { + // fprintf(stderr, ">[M::%s::] ng->a[139131].del::%u\n", __func__, + // ng->a[139131].del); + // } } // prt_usg_t(uidx, ng, "ng2"); // u2g_hybrid_aln(uidx, ng, ob, ub); @@ -16521,15 +16676,21 @@ ul_renew_t *ropt) if(sg->seq[i].del) continue; sg->seq[i].c = PRIMARY_LABLE; } + // fprintf(stderr, "0[M::%s]\n", __func__); hic_clean(sg); + // fprintf(stderr, "1[M::%s]\n", __func__); ma_ug_t *init_ug = ul_realignment(uopt, sg, 0); + // fprintf(stderr, "2[M::%s]\n", __func__); // exit(1); filter_sg_by_ug(sg, init_ug, uopt); + // fprintf(stderr, "3[M::%s]\n", __func__); // print_debug_gfa(sg, init_ug, uopt->coverage_cut, "UL.debug", uopt->sources, uopt->ruIndex, uopt->max_hang, uopt->min_ovlp, 0, 0, 0); // print_ul_alignment(init_ug, &UL_INF, 47072, "after-0"); bub = gen_bubble_chain(sg, init_ug, uopt, &r_het); + // fprintf(stderr, "4[M::%s]\n", __func__); // print_ul_alignment(init_ug, &UL_INF, 47072, "after-1"); ul_resolve_t *uidx = init_ul_resolve_t(sg, init_ug, bub, &UL_INF, uopt, r_het); + // fprintf(stderr, "5[M::%s]\n", __func__); // print_ul_alignment(init_ug, &UL_INF, 47072, "after-2"); // exit(1); // print_raw_uls_seq(uidx, asm_opt.output_file_name); @@ -16556,7 +16717,7 @@ ul_renew_t *ropt) asg_t *ng = renew_ng(uidx->uovl.h_usg, init_ug, sg, uopt, ropt->cov, ropt->ruIndex, 16); - destory_bubbles(uidx->bub); free(uidx->bub); ma_ug_destroy(init_ug); + destory_bubbles(uidx->bub); free(uidx->bub); ma_ug_destroy(init_ug); free(r_het); destroy_ul_resolve_t(uidx); destory_all_ul_t(&UL_INF); asg_destroy((*(ropt->sg))); (*(ropt->sg)) = ng; diff --git a/gfa_ut.h b/gfa_ut.h index 5823e21..c309526 100644 --- a/gfa_ut.h +++ b/gfa_ut.h @@ -3,6 +3,7 @@ #include "Overlaps.h" #include "hic.h" + typedef struct { asg_t *g; ma_hit_t_alloc *src; @@ -35,5 +36,6 @@ void post_rescue(ug_opt_t *uopt, asg_t *sg, ma_hit_t_alloc *src, ma_hit_t_alloc // void print_raw_u2rgfa_seq(all_ul_t *aln, R_to_U* rI, uint32_t is_detail); bubble_type *gen_bubble_chain(asg_t *sg, ma_ug_t *ug, ug_opt_t *uopt, uint8_t **ir_het); void filter_sg_by_ug(asg_t *rg, ma_ug_t *ug, ug_opt_t *uopt); +void ug_ext_gfa(ug_opt_t *uopt, asg_t *sg, uint32_t max_len); #endif diff --git a/hic.cpp b/hic.cpp index 36caaba..d60847c 100644 --- a/hic.cpp +++ b/hic.cpp @@ -2142,6 +2142,7 @@ void print_hits_simp(ha_ug_index* idx, kvec_pe_hit* hits) char dir[2] = {'+', '-'}; for (k = 0; k < hits->a.n; ++k) { + if(((hits->a.a[k].s<<1)>>shif) == ((hits->a.a[k].e<<1)>>shif)) continue; fprintf(stderr, "r-%lu-th\t%c\trs-utg%.6dl\t%lu\t%c\tre-utg%.6dl\t%lu\n", hits->a.a[k].id, dir[hits->a.a[k].s>>63], (int)((hits->a.a[k].s<<1)>>shif)+1, hits->a.a[k].s&idx->pos_mode, @@ -7088,7 +7089,7 @@ void get_bub_graph(ma_ug_t* ug, bubble_type* bub) adjecent = 0; while (pre_id != v) { - if(connect_bub_occ(bub, pre_id>>1, bub->check_het) > 0) + if(connect_bub_occ(bub, pre_id>>1, bub->check_het) > 0)///some bubbles between v and k { adjecent = 1; break; @@ -16480,7 +16481,7 @@ void optimize_u_trans(kv_u_trans_t *ovlp, kvec_pe_hit* hits, ha_ug_index* idx) for (i = 0; i < ovlp->n; i++){ x = &(ovlp->a[i]); if(x->qn > x->tn) continue; - if(x->f != RC_2 || x->del) continue; + if(x->f != RC_2 || x->del) continue;///no need to adjust RC_0/RC_1 occ = get_oe_occ(x->qn, x->tn, hits, idx) + get_oe_occ(x->tn, x->qn, hits, idx);///how many UL bridging qn and tn kv_pushp(u_trans_t, k_trans, &p); (*p) = (*x); p->nw = (x->nw*(1-(((double)(occ<<1))/((double)(hits->occ.a[x->qn]+hits->occ.a[x->tn]))))); @@ -16731,8 +16732,13 @@ int hic_short_align(const enzyme *fn1, const enzyme *fn2, ha_ug_index* idx, ug_o write_hc_hits(&sl.hits, idx->ug, asm_opt.output_file_name); } sl.hits.uID_bits = idx->uID_bits; sl.hits.pos_mode = idx->pos_mode; - optimize_u_trans(&(idx->t_ch->k_trans), &sl.hits, idx); - filter_kv_u_trans_t(&(idx->t_ch->k_trans), idx->ug, 0.5); + + /***debug***/ + if(sl.hits.idx.n == 0) idx_hc_links(&(sl.hits), idx, NULL); + // optimize_u_trans(&(idx->t_ch->k_trans), &sl.hits, idx); + // filter_kv_u_trans_t(&(idx->t_ch->k_trans), idx->ug, 0.5); + /***debug***/ + // print_kv_u_trans_t(&(idx->t_ch->k_trans), idx->ug); // flter_by_cov(idx, &sl.hits, 2); // update_hits(idx, &sl.hits, idx->t_ch->is_r_het); @@ -16740,7 +16746,7 @@ int hic_short_align(const enzyme *fn1, const enzyme *fn2, ha_ug_index* idx, ug_o ////dedup_hits(&(sl.hits), sl.idx); ///write_hc_hits_v14(&sl.hits, asm_opt.output_file_name); - if(asm_opt.misjoin_len > 0) + if(asm_opt.misjoin_len > 0/** && (!(asm_opt.ar))**/)//disable it for the UL assembly { update_switch_unitig(idx->ug, idx->read_g, &(sl.hits), &(idx->t_ch->k_trans), 10, 20, asm_opt.misjoin_len, 0.15); renew_idx_para(idx, idx->ug); @@ -16782,13 +16788,15 @@ int hic_short_align(const enzyme *fn1, const enzyme *fn2, ha_ug_index* idx, ug_o /*******************************for debug************************************/ mc_solve(NULL, NULL, &k_trans, idx->ug, idx->read_g, 0.8, R_INF.trio_flag, (bub.round_id == 0? 1 : 0), s->s, 1, &bub, - &(idx->t_ch->k_trans), 0, 0/**(((bub.round_id+1) == bub.n_round)?1:0)**/); + &(idx->t_ch->k_trans), 0, /**(((bub.round_id+1) == bub.n_round)?1:0)**/0); // mc_solve(NULL, NULL, &k_trans, idx->ug, idx->read_g, 0.8, R_INF.trio_flag, // (bub.round_id == 0? 1 : 0), s->s, 1, NULL, &(idx->t_ch->k_trans), 0, 0); - // if((bub.round_id+1) == bub.n_round) { - // prt_debug_hic(asm_opt.output_file_name, idx->ug, idx, opt, &sl.hits, - // &(idx->t_ch->k_trans), &k_trans, &link, s->s, &(bub)); - // } + /** + if((bub.round_id+1) == bub.n_round) { + prt_debug_hic(asm_opt.output_file_name, idx->ug, idx, opt, &sl.hits, + &(idx->t_ch->k_trans), &k_trans, &link, s->s, &(bub)); + } + **/ /*******************************for debug************************************/ label_unitigs_sm(s->s, NULL, idx->ug); @@ -16853,6 +16861,84 @@ int hic_short_align(const enzyme *fn1, const enzyme *fn2, ha_ug_index* idx, ug_o return 1; } +void bp_solve(ug_opt_t *opt, kv_u_trans_t *ref, ma_ug_t *ug, asg_t *sg, bubble_type *bub, double cis_rate) +{ + uint64_t k, i, v, nvx, n[3], nv; u_trans_t *z; + kv_u_trans_t in; memset(&in, 0, sizeof(in)); + ma_utg_t *u = NULL; asg_arc_t *av; double w; + ps_t *s = init_ps_t(11, ug->g->n_seq); + + for (i = 0; i < ug->g->n_seq; i++) { + s->s[i] = 0; + if((ug->g->seq[i].del) || (IF_HOM(i, (*bub)))) { + continue; + } + + u = &ug->u.a[i]; + if(u->m == 0 || u->n == 0) continue; + for (k = n[0] = n[1] = n[2] = 0; k < u->n; k++) { + if(R_INF.trio_flag[u->a[k]>>33] == FATHER) n[1]++; + else if(R_INF.trio_flag[u->a[k]>>33] == MOTHER) n[2]++; + else n[0]++; + } + if(n[1] >= n[2] && n[1] >= n[0]) s->s[i] = 1; + else if(n[2] >= n[1] && n[2] >= n[0]) s->s[i] = -1; + } + + kv_resize(u_trans_t, in, ref->n); + for (k = in.n = 0; k < ref->n; k++) { + if((IF_HOM(ref->a[k].qn, (*bub))) || (IF_HOM(ref->a[k].tn, (*bub))) || ref->a[k].del) { + continue; + } + kv_push(u_trans_t, in, ref->a[k]); + } + + if(cis_rate > 0) { + nvx = ug->g->n_seq<<1; + for (v = 0; v < nvx; v++) { + if((ug->g->seq[v>>1].del) || (IF_HOM((v>>1), (*bub)))) continue; + av = asg_arc_a(ug->g, v); nv = asg_arc_n(ug->g, v); + for (i = 0; i < nv; i++) { + if((av[i].del) || (av[i].v>>1) == (v>>1) || + (ug->g->seq[av[i].v>>1].del) || (IF_HOM((av[i].v>>1), (*bub)))) { + continue; + } + w = ((double)av[i].ol)*cis_rate; + if(w <= 0) continue; + kv_pushp(u_trans_t, in, &z); + memset(z, 0, sizeof((*z))); + z->f = RC_3; z->nw =-w; z->rev = (v^av[i].v)&1; + z->qn = v>>1; z->tn = av[i].v>>1; + + z->qs = 0; z->qe = sg->seq[z->qn].len; + if(av[i].ol < sg->seq[z->qn].len) { + if(v&1) { + z->qe = z->qs + av[i].ol; + } else { + z->qs = z->qe - av[i].ol; + } + } + + z->ts = 0; z->te = sg->seq[z->tn].len; + if(av[i].ol < sg->seq[z->tn].len) { + if(av[i].v&1) { + z->ts = z->te - av[i].ol; + } else { + z->te = z->ts + av[i].ol; + } + } + } + + } + } + // fprintf(stderr, "[M::%s::] ==> nup::%lu\n", __func__, nup); + clean_u_trans_t_idx_adv(&in, ug, sg); + + mc_solve(NULL, NULL, &in, ug, sg, 0.8, NULL, 0, s->s, 1, bub, ref, 0, 0); + label_unitigs_sm(s->s, NULL, ug); + destory_ps_t(&s); kv_destroy(in); kv_destroy(in.idx); +} + spg_t *hic_short_pre_align(const enzyme *fn1, const enzyme *fn2, ha_ug_index* idx, ug_opt_t *opt, kvec_pe_hit **rhits) { // double index_time = yak_realtime(); diff --git a/hic.h b/hic.h index a6bc80c..fc456e2 100644 --- a/hic.h +++ b/hic.h @@ -8,6 +8,7 @@ #define RC_0 0 #define RC_1 1 #define RC_2 2 +#define RC_3 3 hc_edge* get_hc_edge(hc_links* link, uint64_t src, uint64_t dest, uint64_t dir); hc_edge* push_hc_edge(hc_linkeage* x, uint64_t uID, double weight, int dir, uint64_t* d); @@ -111,5 +112,6 @@ void dedup_hits(kvec_pe_hit* hits, uint64_t is_dup); void hic_analysis(ma_ug_t *ug, asg_t* read_g, trans_chain* t_ch, ug_opt_t *opt, uint32_t is_poy, kvec_pe_hit **rhits); spg_t *hic_pre_analysis(ma_ug_t *ug, asg_t* read_g, trans_chain* t_ch, ug_opt_t *opt, kvec_pe_hit **rhits); void prt_bubble_gfa_adv(FILE *fp, bubble_type *bub, const char* utg_pre, const char* bub_pre, const char* chain_pre); +void bp_solve(ug_opt_t *opt, kv_u_trans_t *ref, ma_ug_t *ug, asg_t *sg, bubble_type *bub, double cis_rate); #endif diff --git a/horder.cpp b/horder.cpp index 7fe2e55..e3a9d34 100644 --- a/horder.cpp +++ b/horder.cpp @@ -960,7 +960,7 @@ int64_t cov_s_pos, int64_t cov_e_pos, uint64_t *s, uint64_t *e) (*s) = (*e) = (uint64_t)-1; res->n = 0; get_hic_cov_interval(b, b_n, cutoff, &cov_s_pos, &cov_e_pos, res); - if(res->n == 0) return; + if(res->n == 0) return;///res keeps all intervals with >= cutoff coverage int64_t max = -1, max_cur = 0; int64_t max_s_idx, max_e_idx, cur_s_idx; for (i = 0; i < res->n; i++)//all intervals have cov >= cutoff @@ -1842,7 +1842,7 @@ void break_phasing_utg(ma_ug_t *ug, asg_t *rg, kvec_pe_hit *hits, kv_u_trans_t * kv_pushp(h_cov_t, join, &p); p->dp = (b_points->a[l].s<<32)|(ug->u.n-1); p->s = rsi; p->e = ug->u.a[ug->u.n-1].len; - kv_push(uint64_t, dbg_N50, (uint64_t)(ug->u.a[ug->u.n-1].len)+((uint64_t)(1)<<63)); + kv_push(uint64_t, dbg_N50, (uint64_t)(ug->u.a[ug->u.n-1].len)+((uint64_t)(1)<<63));///new nodes } } pidx = idx; @@ -1858,13 +1858,13 @@ void break_phasing_utg(ma_ug_t *ug, asg_t *rg, kvec_pe_hit *hits, kv_u_trans_t * kv_pushp(h_cov_t, join, &p); p->dp = (b_points->a[l].s<<32)|(ug->u.n-1); p->s = rsi; p->e = ug->u.a[ug->u.n-1].len; - kv_push(uint64_t, dbg_N50, (uint64_t)(ug->u.a[ug->u.n-1].len)+((uint64_t)(1)<<63)); + kv_push(uint64_t, dbg_N50, (uint64_t)(ug->u.a[ug->u.n-1].len)+((uint64_t)(1)<<63));///new nodes } } if(de_u) { - kv_push(uint64_t, dbg_N50, ug->u.a[b_points->a[l].s].len); + kv_push(uint64_t, dbg_N50, ug->u.a[b_points->a[l].s].len);///old nodes update_nus(&(ug->u.a[b_points->a[l].s]), ug, join.a + pn, join.n - pn); free(ug->u.a[b_points->a[l].s].a); free(ug->u.a[b_points->a[l].s].s); memset(&(ug->u.a[b_points->a[l].s]), 0, sizeof(ug->u.a[b_points->a[l].s])); @@ -1890,7 +1890,7 @@ void break_phasing_utg(ma_ug_t *ug, asg_t *rg, kvec_pe_hit *hits, kv_u_trans_t * if(i < oug_n) { kv_pushp(h_cov_t, join, &p); - p->dp = (i<<32)|(m); + p->dp = (i<<32)|(m);///current id | updated id p->s = 0; p->e = ug->u.a[i].len; } @@ -1963,7 +1963,11 @@ void break_phasing_utg(ma_ug_t *ug, asg_t *rg, kvec_pe_hit *hits, kv_u_trans_t * free(k_trans->a); free(k_trans->idx.a); memset(k_trans, 0, sizeof(*k_trans)); k_trans->a = n_trans->a; k_trans->n = n_trans->n; k_trans->m = n_trans->m; free(n_trans); - clean_u_trans_t_idx(k_trans, ug, rg); + + clean_u_trans_t_idx_adv(k_trans, ug, rg); + // clean_u_trans_t_idx(k_trans, ug, rg); + + uint64_t uID_bits, pos_mode; for (uID_bits=1; (uint64_t)(1<u.n; uID_bits++); pos_mode = ((uint64_t)-1)>>(uID_bits+1); @@ -2069,6 +2073,8 @@ uint64_t min_ulen, double boundaryRate) // fprintf(stderr, "consensus_break-s: %lu, e: %lu\n", cov_buf.a[i].s, cov_buf.a[i].e); // } /*******************************for debug************************************/ + ///res -> all low coverage intervals; + ///cov_buf -> consensus coverage intervals; get_read_breaks(&(ug->u.a[get_hit_suid(*hits, l)]), rg, &cov_buf, &res, hits, l, k, ulen, &bs, &dp); if(bs == (uint64_t)-1) fprintf(stderr, "ERROR-read\n"); diff --git a/inter.cpp b/inter.cpp index 57af6af..e44d57a 100644 --- a/inter.cpp +++ b/inter.cpp @@ -16,6 +16,8 @@ #include "Correct.h" #include "Process_Read.h" #include "Assembly.h" +#include "hic.h" +#include "gfa_ut.h" KSEQ_INIT(gzFile, gzread) #define oreg_xe_lt(a, b) (((uint64_t)(a).x_pos_e<<32|(a).x_pos_s) < ((uint64_t)(b).x_pos_e<<32|(b).x_pos_s)) @@ -24,9 +26,10 @@ KSORT_INIT(or_xe, overlap_region, oreg_xe_lt) void ha_get_ul_candidates_interface(ha_abufl_t *ab, int64_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, overlap_region_alloc *overlap_list_hp, Candidates_list *cl, double bw_thres, int max_n_chain, int keep_whole_chain, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* chain_idx, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t high_occ, void *km); void ul_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off); -void ug_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, uint32_t is_hpc); + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); +int64_t ug_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, double bw_thres_sec, + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, + uint32_t gen_off, double mcopy_rate, uint32_t mcopy_khit_cut, uint32_t is_hpc, ha_mzl_t *res, uint64_t res_n, ha_mzl_t *idx, uint64_t idx_n, uint64_t mzl_cutoff, uint64_t chain_cutoff, kv_u_trans_t *kov); #define MG_SEED_IGNORE (1ULL<<41) #define MG_SEED_TANDEM (1ULL<<42) @@ -47,7 +50,7 @@ void ug_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t #define SEC_LEN_DIF 0.03 #define REA_ALIGN_CUTOFF 32 -#define CHUNK_SIZE 500000000 +#define CHUNK_SIZE 1000000000 #define generic_key(x) (x) KRADIX_SORT_INIT(gfa64, uint64_t, generic_key, 8) @@ -375,26 +378,27 @@ typedef struct { // data structure for each step in kt_pipeline() } utepdat_t; -typedef struct { // global data structure for kt_pipeline() - uint64_t x, i; -} ha_mzl_srt_t; - -typedef struct { size_t n, m; ha_mzl_srt_t *a; } ha_mzl_srt_v; +#define ha_mzl_t_key(p) ((p).x) +KRADIX_SORT_INIT(ha_mzl_t_srt, ha_mzl_t, ha_mzl_t_key, member_size(ha_mzl_t, x)) typedef struct { // global data structure for kt_pipeline() + ug_opt_t *uopt; ma_ug_t *ug; asg_t *rg; ha_ovec_buf_t **hab; glchain_t *ll; + bubble_type *bub; int32_t w, k; ///base-alignment - double bw_thres, diff_ec_ul; + double bw_thres, diff_ec_ul, sec_cutoff; + double bw_thres_double, diff_ec_ul_double; int32_t max_n_chain; ha_mzl_v idx_a; asg64_v idx_n; - ha_mzl_srt_v srt_a; - int32_t is_HPC, bw, max_gap, chn_pen_gap, n_thread, is_cnt; + ha_mzl_v srt_a; + int32_t is_HPC, bw, max_gap, chn_pen_gap, n_thread, is_cnt, is_ovlp, mini_cut, chain_cut, keep_unsymm_arc; ul_idx_t udb; + kv_u_trans_t *filter; } ug_trans_t; void hc_glchain_destroy(glchain_t *b) @@ -8957,7 +8961,7 @@ static void worker_for_ul_scall_alignment(void *data, long i, int tid) // callba // ha_get_ul_candidates_interface(b->abl, i, s->seq[i], s->len[i], s->opt->w, s->opt->k, s->uu, &b->olist, &b->olist_hp, &b->clist, s->opt->bw_thres, // s->opt->max_n_chain, 1, NULL/**&(b->k_flag)**/, &b->r_buf, &(b->tmp_region), NULL, &(b->sp), asm_opt.hom_cov, km); ul_map_lchain(b->abl, (uint32_t)-1, s->seq[i], s->len[i], s->opt->w, s->opt->k, s->uu, &b->olist, &b->clist, s->opt->bw_thres, - s->opt->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1); + s->opt->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2/**0.75**/, 2, 3); clear_Cigar_record(&b->cigar1); clear_Round2_alignment(&b->round2); @@ -9058,7 +9062,7 @@ static void worker_for_ul_rescall_alignment(void *data, long i, int tid) // call // ha_get_ul_candidates_interface(b->abl, i, s->seq[i], s->len[i], s->opt->w, s->opt->k, s->uu, &b->olist, &b->olist_hp, &b->clist, s->opt->bw_thres, // s->opt->max_n_chain, 1, NULL, &b->r_buf, &(b->tmp_region), NULL, &(b->sp), 1, NULL); ul_map_lchain(b->abl, (uint32_t)-1, s->seq[i], s->len[i], s->opt->w, s->opt->k, s->uu, &b->olist, &b->clist, s->opt->bw_thres, - s->opt->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1); + s->opt->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2, /**0**/2, 1/**3**/); clear_Cigar_record(&b->cigar1); clear_Round2_alignment(&b->round2); @@ -9206,124 +9210,917 @@ uint32_t ck_ul_alignment(ul_vec_t *x) return 0; } +uint32_t filter_sec_trans_ovlp(ul_ov_t *a, uint32_t a_n, uint32_t qn, double sec_rate) +{ + if(a_n <= 0) return a_n; + uint32_t i, m, k, oz, op, ovlp, min_sc, max_sc; int64_t nw[2], ml, uml, msc; ul_ov_t *z, *p, t; + for (i = m = 0; i < a_n; i++) { + z = &(a[i]); z->qn = 0; + ml = z->qe - z->qs; uml = z->sec; ml -= uml; + nw[0] = (ml*CHAIN_MATCH) - (uml*CHAIN_UNMATCH); + ml = z->te - z->ts; uml = z->sec; ml -= uml; + nw[1] = (ml*CHAIN_MATCH) - (uml*CHAIN_UNMATCH); + msc = MIN(nw[0], nw[1]); + if(msc <= 0) continue; + if(msc >= UINT32_MAX) msc = UINT32_MAX; + z->qn = msc; + a[m++] = (*z); + } + a_n = m; + if(a_n <= 0) return a_n; + radix_sort_ul_ov_srt_qn(a, a + a_n);//sort by scores + for (i = 0, m = (a_n>>1); i < m; i++) { + t = a[i]; a[i] = a[a_n-i-1]; a[a_n-i-1] = t; + } + + for (k = m = 0; k < a_n; k++) { + z = &a[k]; oz = z->qe - z->qs; ///current chain + for (i = 0; i < m; i++) { + p = &(a[i]); + ovlp = ((MIN(z->qe, p->qe) > MAX(z->qs, p->qs))? (MIN(z->qe, p->qe) - MAX(z->qs, p->qs)):0); + if(ovlp == 0) continue; + op = p->qe - p->qs; + min_sc = MIN(p->qn, z->qn); max_sc = MAX(p->qn, z->qn); + if(min_sc <= (max_sc*0.95)) { + if((ovlp > (oz*sec_rate)) || (ovlp > (op*sec_rate))) break; + } + } + if(i < m) continue; + a[m++] = a[k]; + } + a_n = m; + for (k = 0; k < a_n; k++) a[k].qn = qn; + return a_n; +} + +uint32_t trans_ovlp_connect(ul_ov_t *p, ma_ug_t *ug) +{ + uint32_t v = p->qn<<1, w = (p->tn<<1) + ((uint32_t)p->rev), i, dg = (uint32_t)-1, da, dif, mm; + asg_arc_t *av = asg_arc_a(ug->g, v); uint32_t nv = asg_arc_n(ug->g, v); + for (i = 0; i < nv; i++) { + if(av[i].del || av[i].v != w) continue; + dg = av[i].ol; + break; + } + // if(i >= nv) return 1; + if(i < nv) { + da = (p->qe - p->qs); + dif = (dg>da? dg-da:da-dg); mm = MAX(dg, da); mm *= 0.06; if(mm < 8) mm = 8; + if(dif <= mm) return 0; + + da = (p->te - p->ts); + dif = (dg>da? dg-da:da-dg); mm = MAX(dg, da); mm *= 0.06; if(mm < 8) mm = 8; + if(dif <= mm) return 0; + } + + + v ^= 1; w ^= 1; dg = (uint32_t)-1; + av = asg_arc_a(ug->g, v); nv = asg_arc_n(ug->g, v); + for (i = 0; i < nv; i++) { + if(av[i].del || av[i].v != w) continue; + dg = av[i].ol; + break; + } + if(i < nv) { + da = (p->qe - p->qs); + dif = (dg>da? dg-da:da-dg); mm = MAX(dg, da); mm *= 0.06; if(mm < 8) mm = 8; + if(dif <= mm) return 0; + + da = (p->te - p->ts); + dif = (dg>da? dg-da:da-dg); mm = MAX(dg, da); mm *= 0.06; if(mm < 8) mm = 8; + if(dif <= mm) return 0; + } + + return 1; +} + +uint64_t filter_by_reliable_ovlp0(u_trans_t *a, uint64_t a_n, uint64_t s, uint64_t e, double sec_rate) +{ + uint64_t k, ovlp; + for (k = 0; k < a_n; k++) { + if(a[k].del) continue; + if(a[k].f == RC_0 || a[k].f == RC_1) { + ovlp = ((MIN(a[k].qe, e) > MAX(a[k].qs, s))? (MIN(a[k].qe, e)-MAX(a[k].qs, s)):0); + if(ovlp == 0) continue; + if((ovlp) && (ovlp > ((e-s)*sec_rate))) return 1; + } + } + return 0; +} + +void filter_by_reliable_ovlp(uint32_t id, kv_u_trans_t *idx, st_mt_t *sp, overlap_region_alloc* ol, const ul_idx_t *udb, double sec_rate, uint32_t pre_filter) +{ + u_trans_t *a; uint64_t n, k, l, s, e, s0, e0, z, ov, rr; overlap_region *m, t; + a = u_trans_a(*idx, id); n = u_trans_n(*idx, id); + for (k = sp->n = 0; k < n; k++) { + if(a[k].del) continue; + if(a[k].f == RC_0 || a[k].f == RC_1) { + z = a[k].tn; z <<= 1; z |= a[k].rev; z <<= 32; z |= k; + kv_push(uint64_t, *sp, z); + // fprintf(stderr, "***utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\n", + // a[k].qn+1, "lc"[udb->ug->u.a[a[k].qn].circ], udb->ug->u.a[a[k].qn].len, a[k].qs, a[k].qe, "+-"[a[k].rev], + // a[k].tn+1, "lc"[udb->ug->u.a[a[k].tn].circ], udb->ug->u.a[a[k].tn].len, a[k].ts, a[k].te); + } + } + if(!sp->n) return; + for (k = 0; k < ol->length; k++) { + if((!pre_filter) || (ol->list[k].x_pos_strand)) {///corresponding to the overlaps within idx + z = ol->list[k].y_id; z <<= 1; z |= ol->list[k].y_pos_strand; + z <<= 32; z |= k; z |= ((uint64_t)0x80000000); + kv_push(uint64_t, *sp, z); + ol->list[k].x_pos_strand = 0; + } + } + radix_sort_gfa64(sp->a, sp->a + sp->n); + for (k = 1, l = 0; k <= sp->n; k++) { + if(k == sp->n || (sp->a[l]>>32)!=(sp->a[k]>>32)) { + if((k - l > 1) && (!(sp->a[l]&((uint64_t)0x80000000)))) { + for (z = l; z < k; z++) { + if(!(sp->a[z]&((uint64_t)0x80000000))) continue; + rr = (uint32_t)(sp->a[z]-((uint64_t)0x80000000)); + ol->list[rr].x_pos_strand = 1; + } + } + l = k; + } + } + + for (k = sp->n = 0; k < n; k++) { + if(a[k].del) continue; + if(a[k].f == RC_0 || a[k].f == RC_1) { + kv_push(uint64_t, *sp, (((uint64_t)a[k].qs)<<32)|((uint64_t)a[k].qe)); + } + } + if(sp->n > 1) { + radix_sort_gfa64(sp->a, sp->a + sp->n); + for (k = z = 0; k < sp->n; k++) { + s = sp->a[k]>>32; e = (uint32_t)sp->a[k]; + if(z > 0 && s <= ((uint32_t)sp->a[z-1])) { + if(e > ((uint32_t)sp->a[z-1])) { + sp->a[z-1] >>= 32; sp->a[z-1] <<= 32; sp->a[z-1] |= e; + } + } else { + sp->a[z++] = sp->a[k]; + } + } + sp->n = z; + } + + for (k = rr = 0; k < ol->length; k++) { + m = &(ol->list[k]); + if(m->x_pos_strand == 0) { + s = m->x_pos_s; e = m->x_pos_e + 1; l = 0; + for (z = 0; z < sp->n; z++) { + s0 = sp->a[z]>>32; e0 = (uint32_t)sp->a[z]; + ov = ((MIN(e, e0) > MAX(s, s0))? (MIN(e, e0) - MAX(s, s0)):0); + l += ov; + if((l) && (l > ((e-s)*sec_rate))) break; + } + if(z < sp->n) { + // fprintf(stderr, ">[M::%s] utg%.6u%c -> utg%.6u%c\n", __func__, id+1, "lc"[udb->ug->u.a[id].circ], + // m->y_id+1, "lc"[udb->ug->u.a[m->y_id].circ]); + continue;///filter by the overlaps of id/m->x_id + } + ///filter by the overlaps of m->y_id + if(filter_by_reliable_ovlp0(u_trans_a(*idx, m->y_id), u_trans_n(*idx, m->y_id), + m->y_pos_strand?udb->ug->u.a[m->y_id].len-m->y_pos_e-1:m->y_pos_s, + m->y_pos_strand?udb->ug->u.a[m->y_id].len-m->y_pos_s:m->y_pos_e+1, sec_rate)) { + // fprintf(stderr, "<[M::%s] utg%.6u%c -> utg%.6u%c\n", __func__, id+1, "lc"[udb->ug->u.a[id].circ], + // m->y_id+1, "lc"[udb->ug->u.a[m->y_id].circ]); + continue; + } + } + m->x_pos_strand = 0; + if(rr != k) { + t = ol->list[k]; + ol->list[k] = ol->list[rr]; + ol->list[rr] = t; + } + // fprintf(stderr, ">>>utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\n", + // ol->list[rr].x_id+1, "lc"[udb->ug->u.a[ol->list[rr].x_id].circ], udb->ug->u.a[ol->list[rr].x_id].len, + // ol->list[rr].x_pos_s, ol->list[rr].x_pos_e+1, "+-"[ol->list[rr].y_pos_strand], + // ol->list[rr].y_id+1, "lc"[udb->ug->u.a[ol->list[rr].y_id].circ], udb->ug->u.a[ol->list[rr].y_id].len, + // ol->list[rr].y_pos_s, ol->list[rr].y_pos_e+1); + rr++; + } + // if(id == 1576) { + // fprintf(stderr, "[M::%s] utg%.6ul, ol->length0::%lu, ol->length::%lu\n", __func__, id+1, ol->length, rr); + // } + ol->length = rr; +} static void worker_for_trans_ovlp(void *data, long i, int tid) // callback for kt_for() { - /** ug_trans_t *s = (ug_trans_t*)data; ha_ovec_buf_t *b = s->hab[tid]; - glchain_t *bl = &(s->ll[tid]); - int64_t winLen = MIN((((double)THRESHOLD_MAX_SIZE)/s->diff_ec_ul), WINDOW), ton = 0; - uint32_t high_occ = 2, phase = 1, k; - asg64_v b0, b1, b2; window_list p; memset(&p, 0, sizeof(p)); - overlap_region *aux_o = NULL; + kv_ul_ov_t *bl = &(s->ll[tid].tk); + int64_t winLen = MIN((((double)THRESHOLD_MAX_SIZE)/s->diff_ec_ul), WINDOW); + uint32_t high_occ = asm_opt.polyploidy + 1, k; uint64_t cnt, bn; + overlap_region *aux_o = NULL; ul_ov_t *p; char *seq = s->ug->u.a[i].s; int64_t len = s->ug->u.a[i].len; - **/ - // uint64_t align = 0; - - // if(UL_INF.a[s->id+i].rlen != s->len[i]) { - // fprintf(stderr, "[M::%s] rid:%ld, s->len:%lu, UL_INF->rlen:%u\n", __func__, s->id+i, s->len[i], UL_INF.a[s->id+i].rlen); - // } - // void *km = s->buf?(s->buf[tid]?s->buf[tid]->km:NULL):NULL; - // if(s->id+i!=3046) return; - // if((s->id+i!=871) && (s->id+i!=963) && (s->id+i!=980)) return; - // if(s->id+i!=944) return; - // if(s->id+i != 35437) return; - // if((s->id+i != 7086) && (s->id+i != 51705) && (s->id+i != 266022) && (s->id+i != 353608) - // && (s->id+i != 399416) && (s->id+i != 403014) && (s->id+i != 420915) && (s->id+i != 603855) - // && (s->id+i != 680134) && (s->id+i != 766261) && (s->id+i != 794527)) { - // return; - // } - // char *as = NULL; + if((!s->is_ovlp) && (s->is_cnt)) s->idx_n.a[i] = 0; + if(IF_HOM(i, (*(s->bub))) || s->ug->g->seq[i].del) return; + // asprintf(&as, "\n[M::%s] rid::%ld, len::%lu, name::%.*s\n", __func__, s->id+i, s->len[i], (int32_t)UL_INF.nid.a[s->id+i].n, UL_INF.nid.a[s->id+i].a); // push_vlog(&(overall_zdbg->a[s->id+i]), as); free(as); as = NULL; - // fprintf(stderr, "\n[M::%s] rid::%ld, len::%lu, name::%.*s\n", __func__, s->id+i, s->len[i], - // (int32_t)UL_INF.nid.a[s->id+i].n, UL_INF.nid.a[s->id+i].a); - // fprintf(stderr, ">%.*s\n%.*s\n", (int32_t)UL_INF.nid.a[s->id+i].n, UL_INF.nid.a[s->id+i].a, - // (int32_t)s->len[i], s->seq[i]); - // if (memcmp(UL_INF.nid.a[s->id+i].a, "d0aab024-b3a7-40fb-83cc-22c3d6d951f8", UL_INF.nid.a[s->id+i].n-1)) return; - // fprintf(stderr, "[M::%s::] ==> len: %lu\n", __func__, s->len[i]); - // ha_get_ul_candidates_interface(b->abl, i, s->seq[i], s->len[i], s->opt->w, s->opt->k, s->uu, &b->olist, &b->olist_hp, &b->clist, s->opt->bw_thres, - // s->opt->max_n_chain, 1, NULL, &b->r_buf, &(b->tmp_region), NULL, &(b->sp), 1, NULL); - /** - ug_map_lchain(b->abl, (uint32_t)-1, seq, len, s->w, s->k, &(s->udb), NULL, NULL, s->bw_thres, - s->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, s->is_HPC); - **/ - /** - clear_Cigar_record(&b->cigar1); - clear_Round2_alignment(&b->round2); - // return; - // b->num_correct_base += overlap_statistics(&b->olist, NULL, 0); + if(!s->is_ovlp) { + if(s->is_cnt) { + s->idx_n.a[i] = ug_map_lchain(b->abl, i, seq, len, s->w, s->k, &(s->udb), NULL, NULL, s->bw_thres, s->bw_thres_double, + s->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2, 3, s->is_HPC, NULL, 0, NULL, 0, s->mini_cut, s->chain_cut, NULL); + // fprintf(stderr, "-1-[M::%s] rid::%ld, is_ovlp::%d, is_cnt::%d, len::%d, str::%u, s->idx_n.a[i]::%lu\n", + // __func__, i, s->is_ovlp, s->is_cnt, len, !!seq, s->idx_n.a[i]); + } else { + cnt = ug_map_lchain(b->abl, i, seq, len, s->w, s->k, &(s->udb), NULL, NULL, s->bw_thres, s->bw_thres_double, + s->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2, 3, s->is_HPC, s->idx_a.a + s->idx_n.a[i], 0, NULL, 0, s->mini_cut, s->chain_cut, NULL); + assert(cnt == ((s->idx_n.a[i+1]-s->idx_n.a[i]))); + // if(i == 1576 || i == 2879) { + // fprintf(stderr, "-2-[M::%s] rid::%ld, is_ovlp::%d, is_cnt::%d, len::%ld, str::%u, cnt::%lu\n", + // __func__, i, s->is_ovlp, s->is_cnt, len, !!seq, cnt); + // } + } + return; + } else { + cnt = ((s->idx_n.a[i+1]-s->idx_n.a[i])); + // fprintf(stderr, "\n-0-[M::%s] utg%.6u%c, rid::%ld, is_ovlp::%d, is_cnt::%d, len::%ld, str::%u, cnt::%lu, diff::%f\n", + // __func__, (uint32_t)i+1, "lc"[s->ug->u.a[i].circ], i, s->is_ovlp, s->is_cnt, len, (uint32_t)(!!seq), cnt, s->diff_ec_ul); + ///note: high_occ is different + ug_map_lchain(b->abl, i, seq, len, s->w, s->k, &(s->udb), &b->olist, &b->clist, s->bw_thres, s->bw_thres_double, + s->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2, 3, s->is_HPC, s->idx_a.a + s->idx_n.a[i], cnt, s->srt_a.a, s->srt_a.n, s->mini_cut, s->chain_cut, NULL); + } + + filter_by_reliable_ovlp(i, s->filter, &(b->sp), &b->olist, &(s->udb), s->sec_cutoff, 0); + // fprintf(stderr, "-0-[M::%s] rid::%ld, is_ovlp::%d, is_cnt::%d, len::%ld, str::%u\n", + // __func__, i, s->is_ovlp, s->is_cnt, len, !!seq); + clear_Cigar_record(&b->cigar1); clear_Round2_alignment(&b->round2); + + ug_lalign(&b->olist, &b->clist, &(s->udb), s->uopt, seq, len, &b->self_read, &b->ovlp_read, + &b->correct, &b->exz, aux_o, s->diff_ec_ul, winLen, i, s->k, s->chain_cut, NULL); + + aux_o = gen_aux_ovlp(&b->olist);///must be here - // int fully_cov, abnormal; - // b->self_read.seq = s->seq[i]; b->self_read.length = s->len[i]; b->self_read.size = 0; - // correct_ul_overlap(&b->olist, s->uu, &b->self_read, &b->correct, &b->ovlp_read, &b->POA_Graph, &b->DAGCon, - // &b->cigar1, &b->hap, &b->round2, &b->r_buf, &(b->tmp_region.w_list), 0, 1, &fully_cov, &abnormal, s->opt->diff_ec_ul, winLen, NULL); - // memset(&b->self_read, 0, sizeof(b->self_read)); + ug_lalign(&b->olist, &b->clist, &(s->udb), s->uopt, seq, len, &b->self_read, &b->ovlp_read, + &b->correct, &b->exz, aux_o, s->diff_ec_ul, winLen, i, s->k, s->chain_cut, NULL); + if(b->olist.length > 0) { + cnt = bl->n + b->olist.length; kv_resize(ul_ov_t, (*bl), cnt); + for (k = 0, bn = bl->n; k < b->olist.length; k++) { + // if(b->olist.list[k].non_homopolymer_errors == 0) continue;///exact match is not trans + kv_pushp(ul_ov_t, (*bl), &p); + p->qn = i; p->tn = b->olist.list[k].y_id; p->el = 1; + p->sec = b->olist.list[k].non_homopolymer_errors; + p->rev = b->olist.list[k].y_pos_strand; + p->qs = b->olist.list[k].x_pos_s; p->qe = b->olist.list[k].x_pos_e+1; + // p->ts = b->olist.list[k].y_pos_s; p->te = b->olist.list[k].y_pos_e+1; + if(p->rev) { + p->ts = s->udb.ug->u.a[p->tn].len - (b->olist.list[k].y_pos_e+1); + p->te = s->udb.ug->u.a[p->tn].len - b->olist.list[k].y_pos_s; + } else { + p->ts = b->olist.list[k].y_pos_s; + p->te = b->olist.list[k].y_pos_e+1; + } + // if(i == 5) + // { + // fprintf(stderr, "***utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\ti::%ld\n", + // p->qn+1, "lc"[s->ug->u.a[p->qn].circ], s->ug->u.a[p->qn].len, p->qs, p->qe, "+-"[p->rev], + // p->tn+1, "lc"[s->ug->u.a[p->tn].circ], s->ug->u.a[p->tn].len, p->ts, p->te, i); + // } + if(!trans_ovlp_connect(p, s->ug)) { + bl->n--; + } + // else { + // if(i == 5) + // { + // fprintf(stderr, ">>>utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\ti::%ld\n", + // p->qn+1, "lc"[s->ug->u.a[p->qn].circ], s->ug->u.a[p->qn].len, p->qs, p->qe, "+-"[p->rev], + // p->tn+1, "lc"[s->ug->u.a[p->tn].circ], s->ug->u.a[p->tn].len, p->ts, p->te, i); + // } + // } + } + cnt = filter_sec_trans_ovlp(bl->a+bn, bl->n-bn, i, s->sec_cutoff) + bn; + bl->n = cnt; + } - ul_lalign(&b->olist, &b->clist, s->uu, s->uopt, s->seq[i], s->len[i], &b->self_read, &b->ovlp_read, - &b->correct, &b->exz, &b->hap, &b->r_buf, aux_o, s->opt->diff_ec_ul, winLen, NULL, s->id+i, s->opt->k, &(s->sps[tid]), NULL); - // ul_lalign_old_ed(&b->olist, &b->clist, s->uu, s->seq[i], s->len[i], &b->self_read, &b->ovlp_read, - // &b->correct, &b->hap, &b->r_buf, s->opt->diff_ec_ul, winLen, 1, NULL); - ton = b->olist.length;//all alignments pass similary check - aux_o = gen_aux_ovlp(&b->olist);///must be here - gl_chain_flter(&b->olist, &b->correct, &(s->sps[tid]), bl, s->uu, s->opt->diff_ec_ul, winLen, s->len[i], s->uopt, &phase); - // fprintf(stderr, "[M::%s] rid::%ld, len::%lu, name::%.*s, phase::%u\n", __func__, s->id+i, s->len[i], - // (int32_t)UL_INF.nid.a[s->id+i].n, UL_INF.nid.a[s->id+i].a, phase); - if(phase && gen_shared_intervals(&b->olist, s->uu, s->uopt, winLen, &b->r_buf, &(bl->lo))) { - filter_topN(&b->olist, &(bl->lo), s->len[i], winLen, UL_TOPN, bl); - // update_shared_intervals(&b->olist, s->uu, s->uopt, NULL, &b->ovlp_read, &b->r_buf, &(s->sps[tid]), s->len[i], winLen, &(bl->lo), s->id+i); - - copy_asg_arr(b0, b->hap.snp_srt); copy_asg_arr(b1, s->sps[tid]); copy_asg_arr(b2, b->r_buf.a); - // update_shared_intervals(&b->olist, s->uu, s->uopt, NULL, &b->ovlp_read, &b0, &b1, &b2, s->len[i], winLen, &(bl->lo), s->id+i); - update_sketch_trace(&b->olist, s->uu, s->uopt, NULL, &b->ovlp_read, &b0, &b1, &b2, s->len[i], winLen, &(bl->lo), s->id+i, MAX_LGAP(s->len[i]), s->opt->diff_ec_ul); - copy_asg_arr(b->hap.snp_srt, b0); copy_asg_arr(s->sps[tid], b1); copy_asg_arr(b->r_buf.a, b2); + // fprintf(stderr, "[M::%s] rid:%ld, dd:%u\n", __func__, s->id+i, UL_INF.a[s->id+i].dd); + // int64_t mem[6], mem_hab[6]; + // if(get_utepdat_t_mem_tid(s, tid, mem, mem_hab)>((int64_t)5*(int64_t)1073741824)) { + // fprintf(stderr, "[M::%s::tid->%d::rid->%ld] buffer[0]: %.3fGB(%.3fGB::%.3fGB::%.3fGB::%.3fGB::%.3fGB), buffer[1]: %.3fGB, buffer[2]: %.3fGB, buffer[3]: %.3fGB, buffer[4]: %.3fGB, buffer[5]: %.3fGB\n", + // __func__, tid, i, mem[0]/1073741824.0, + // mem_hab[0]/1073741824.0, mem_hab[1]/1073741824.0, mem_hab[2]/1073741824.0, + // mem_hab[3]/1073741824.0, mem_hab[4]/1073741824.0, + // mem[1]/1073741824.0, mem[2]/1073741824.0, + // mem[3]/1073741824.0, mem[4]/1073741824.0, mem[5]/1073741824.0); + // } +} + + +void filter_by_reliable_ovlp_adv(uint32_t id, kv_u_trans_t *idx, st_mt_t *sp, overlap_region_alloc* ol, const ul_idx_t *udb, double sec_rate, uint64_t avoid_dup_aln, uint64_t *occ1) +{ + (*occ1) = 0; + u_trans_t *a; uint64_t n, k, l, s, e, s0, e0, z, ov, rr, r1, spn; overlap_region *m, t; + a = u_trans_a(*idx, id); n = u_trans_n(*idx, id); + if(avoid_dup_aln) { + kv_resize(uint64_t, *sp, (ol->length)+n); + for (k = sp->n = 0; k < n; k++) { + if(a[k].del) continue; + // if(id == 160) { + // fprintf(stderr, "[rid::%u]\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tnw::%f\tf::%u\n", id, + // a[k].qn+1, "lc"[udb->ug->u.a[a[k].qn].circ], udb->ug->u.a[a[k].qn].len, a[k].qs, a[k].qe, "+-"[a[k].rev], + // a[k].tn+1, "lc"[udb->ug->u.a[a[k].tn].circ], udb->ug->u.a[a[k].tn].len, a[k].ts, a[k].te, a[k].nw, a[k].f); + // } + if(a[k].f == RC_0 || a[k].f == RC_1) { + z = a[k].tn; z <<= 1; z |= a[k].rev; z <<= 32; + kv_push(uint64_t, *sp, z); + } + } + if(sp->n > 0) { + for (k = 0; k < ol->length; k++) { + z = ol->list[k].y_id; z <<= 1; z |= ol->list[k].y_pos_strand; + z <<= 32; z |= k; z |= ((uint64_t)0x80000000); + kv_push(uint64_t, *sp, z); + } + + radix_sort_gfa64(sp->a, sp->a + sp->n); + for (k = 1, l = 0, rr = 0; k <= sp->n; k++) { + if(k == sp->n || (sp->a[l]>>32)!=(sp->a[k]>>32)) { + if((k - l > 1) && (!(sp->a[l]&((uint64_t)0x80000000)))) {///overlap within bck + for (z = l; z < k; z++) { + if(sp->a[z]&((uint64_t)0x80000000)) { + ol->list[(uint32_t)(sp->a[z]-((uint64_t)0x80000000))].y_id = ((uint32_t)-1); + rr++; + } + } + } + l = k; + } + } + + if(rr > 0) { + for (k = rr = 0; k < ol->length; k++) { + if(ol->list[k].y_id == ((uint32_t)-1)) continue; + if(rr != k) { + t = ol->list[rr]; + ol->list[rr] = ol->list[k]; + ol->list[k] = t; + } + rr++; + } + ol->length = rr; + } + } + } + + for (k = sp->n = 0; k < n; k++) { + if(a[k].del) continue; + if(a[k].f == RC_0 || a[k].f == RC_1) { + kv_push(uint64_t, *sp, (((uint64_t)a[k].qs)<<32)|((uint64_t)a[k].qe)); + } + } + if(sp->n > 1) { + radix_sort_gfa64(sp->a, sp->a + sp->n); + for (k = z = 0; k < sp->n; k++) { + s = sp->a[k]>>32; e = (uint32_t)sp->a[k]; + if(z > 0 && s <= ((uint32_t)sp->a[z-1])) { + if(e > ((uint32_t)sp->a[z-1])) { + sp->a[z-1] >>= 32; sp->a[z-1] <<= 32; sp->a[z-1] |= e; + } + } else { + sp->a[z++] = sp->a[k]; + } + } + sp->n = z; + } + + for (k = rr = r1 = 0, spn = sp->n; k < ol->length; k++) { + m = &(ol->list[k]); + // if(id == 160) { + // fprintf(stderr, "[M::%s] utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\txstr::%u\n", __func__, + // ol->list[k].x_id+1, "lc"[udb->ug->u.a[ol->list[k].x_id].circ], udb->ug->u.a[ol->list[k].x_id].len, + // ol->list[k].x_pos_s, ol->list[k].x_pos_e+1, "+-"[ol->list[k].y_pos_strand], + // ol->list[k].y_id+1, "lc"[udb->ug->u.a[ol->list[k].y_id].circ], udb->ug->u.a[ol->list[k].y_id].len, + // ol->list[k].y_pos_s, ol->list[k].y_pos_e+1, ol->list[k].x_pos_strand); + // } + if(m->x_pos_strand == 0) { + s = m->x_pos_s; e = m->x_pos_e + 1; l = 0; + for (z = 0; z < spn; z++) { + s0 = sp->a[z]>>32; e0 = (uint32_t)sp->a[z]; + ov = ((MIN(e, e0) > MAX(s, s0))? (MIN(e, e0) - MAX(s, s0)):0); + l += ov; + if((l) && (l > ((e-s)*sec_rate))) break; + } + ///filter by the overlaps of id/m->x_id + if(z < spn) continue; + ///filter by the overlaps of m->y_id + if(filter_by_reliable_ovlp0(u_trans_a(*idx, m->y_id), u_trans_n(*idx, m->y_id), + m->y_pos_strand?udb->ug->u.a[m->y_id].len-m->y_pos_e-1:m->y_pos_s, + m->y_pos_strand?udb->ug->u.a[m->y_id].len-m->y_pos_s:m->y_pos_e+1, sec_rate)) { + continue; + } + } + // else { + // if(id == 7) { + // fprintf(stderr, "[M::%s] utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\n", __func__, + // ol->list[k].x_id+1, "lc"[udb->ug->u.a[ol->list[k].x_id].circ], udb->ug->u.a[ol->list[k].x_id].len, + // ol->list[k].x_pos_s, ol->list[k].x_pos_e+1, "+-"[ol->list[k].y_pos_strand], + // ol->list[k].y_id+1, "lc"[udb->ug->u.a[ol->list[k].y_id].circ], udb->ug->u.a[ol->list[k].y_id].len, + // ol->list[k].y_pos_s, ol->list[k].y_pos_e+1); + // } + // } + if(rr != k) { + t = ol->list[k]; + ol->list[k] = ol->list[rr]; + ol->list[rr] = t; + } + if(ol->list[rr].x_pos_strand) { + ol->list[rr].x_pos_strand = 0; + if(r1 != rr) { + t = ol->list[r1]; + ol->list[r1] = ol->list[rr]; + ol->list[rr] = t; + } + r1++; + } + // fprintf(stderr, ">>>utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\n", + // ol->list[rr].x_id+1, "lc"[udb->ug->u.a[ol->list[rr].x_id].circ], udb->ug->u.a[ol->list[rr].x_id].len, + // ol->list[rr].x_pos_s, ol->list[rr].x_pos_e+1, "+-"[ol->list[rr].y_pos_strand], + // ol->list[rr].y_id+1, "lc"[udb->ug->u.a[ol->list[rr].y_id].circ], udb->ug->u.a[ol->list[rr].y_id].len, + // ol->list[rr].y_pos_s, ol->list[rr].y_pos_e+1); + rr++; + } + // if(id == 1576) { + // fprintf(stderr, "[M::%s] utg%.6ul, ol->length0::%lu, ol->length::%lu\n", __func__, id+1, ol->length, rr); + // } + ol->length = rr; (*occ1) = r1; +} + +void prt_split_ovs(overlap_region_alloc* ol, ma_ug_t *ug, uint64_t ol_h, uint64_t ol_l, const char* cmd) +{ + uint64_t k; overlap_region *z; + fprintf(stderr, "[M::%s] %s\tol->length::%lu, ol_h::%lu, ol_l::%lu\n", __func__, cmd, ol->length, ol_h, ol_l); + // assert((ol_h+ol_l) == ol->length); + for (k = 0; k < ol_h; k++) { + z = &(ol->list[k]); + fprintf(stderr, "oh->[%s]\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\twn::%u\taln::%u\ted::%u\n", cmd, + z->x_id+1, "lc"[ug->u.a[z->x_id].circ], ug->u.a[z->x_id].len, + z->x_pos_s, z->x_pos_e+1, "+-"[z->y_pos_strand], + z->y_id+1, "lc"[ug->u.a[z->y_id].circ], ug->u.a[z->y_id].len, + z->y_pos_s, z->y_pos_e+1, (uint32_t)z->w_list.n, z->align_length, z->non_homopolymer_errors); + } + + for (; k < ol->length; k++) { + z = &(ol->list[k]); + fprintf(stderr, "ol->[%s]\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\twn::%u\taln::%u\ted::%u\n", cmd, + z->x_id+1, "lc"[ug->u.a[z->x_id].circ], ug->u.a[z->x_id].len, + z->x_pos_s, z->x_pos_e+1, "+-"[z->y_pos_strand], + z->y_id+1, "lc"[ug->u.a[z->y_id].circ], ug->u.a[z->y_id].len, + z->y_pos_s, z->y_pos_e+1, (uint32_t)z->w_list.n, z->align_length, z->non_homopolymer_errors); + } +} + + +uint64_t split_ug_lalign(uint64_t ol_h, overlap_region_alloc* ol, double errh, double errl, + Candidates_list *cl, const ul_idx_t *uref, const ug_opt_t *uopt, + char *qstr, uint64_t ql, UC_Read* qu, UC_Read* tu, Correct_dumy* dumy, bit_extz_t *exz, + overlap_region *aux_o, int64_t sid, uint64_t khit, uint64_t chain_cut, void *km) +{ + uint64_t ol_l = ol->length - ol_h, on0, k, m; int64_t wl; double erate; overlap_region t; + // if(sid == 160) fprintf(stderr, "[M::%s] errh::%f, errl::%f\n", __func__, errh, errl); + // if(sid == 160) prt_split_ovs(ol, uref->ug, ol_h, ol_l, "st"); + if(ol_h) { + erate = errh; on0 = ol->length; + wl = MIN((((double)THRESHOLD_MAX_SIZE)/erate), WINDOW); + ol->length = ol_h; + ug_lalign(ol, cl, uref, uopt, qstr, ql, qu, tu, dumy, exz, aux_o, erate, wl, sid, khit, chain_cut, km); + for (k = ol_h, m = ol->length; k < on0; k++) {///move ovlps with ol_l + if(k != m) { + t = ol->list[k]; + ol->list[k] = ol->list[m]; + ol->list[m] = t; + } + m++; + } + ol_h = ol->length; ol->length = m; ol_l = ol->length - ol_h; + } + // if(sid == 160) prt_split_ovs(ol, uref->ug, ol_h, ol_l, "mi"); + if(ol_l) { + erate = errl; on0 = ol->length; + wl = MIN((((double)THRESHOLD_MAX_SIZE)/erate), WINDOW); + if(ol_h) {///swap ol_h and ol_l + for (k = ol_h, m = 0; k < ol->length; k++) { + if(k != m) { + t = ol->list[k]; + ol->list[k] = ol->list[m]; + ol->list[m] = t; + } + m++; + } + } + // if(sid == 7) prt_split_ovs(ol, uref->ug, ol_h, ol_l, "sw"); + ol->length = ol_l; + ug_lalign(ol, cl, uref, uopt, qstr, ql, qu, tu, dumy, exz, aux_o, erate, wl, sid, khit, chain_cut, km); + // if(sid == 7) { + // fprintf(stderr, "[M::%s::]\ton0::%lu\tol_l::%lu\tol_h::%lu\tol->length::%lu\n", __func__, + // on0, ol_l, ol_h, ol->length); + // prt_split_ovs(ol, uref->ug, ol_h, ol_l, "u0"); + // } + for (k = ol_l, m = ol->length; k < on0; k++) {///move ovlps with ol_h + if(k != m) { + t = ol->list[k]; + ol->list[k] = ol->list[m]; + ol->list[m] = t; + } + m++; + } + ol_l = ol->length; ol->length = m; ol_h = ol->length - ol_l; + // if(sid == 7) prt_split_ovs(ol, uref->ug, ol_h, ol_l, "u1"); + if(ol_l) {///swap ol_h and ol_l + for (k = ol_l, m = 0; k < ol->length; k++) { + if(k != m) { + t = ol->list[k]; + ol->list[k] = ol->list[m]; + ol->list[m] = t; + } + m++; + } + } + } + // if(sid == 160) prt_split_ovs(ol, uref->ug, ol_h, ol_l, "ed"); + return ol_h; +} + +uint32_t test_het_aln(ma_ug_t *ug, uint64_t rid, u_trans_t *a, uint64_t a_n, st_mt_t *sp, overlap_region_alloc* ol, uint64_t len) +{ + uint64_t k, s, e, z, l = 0; + for (k = sp->n = 0; k < a_n; k++) { + if(a[k].del) continue; + // if(rid == 56 || rid == 160) { + // fprintf(stderr, "[rid::%lu]\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tnw::%f\tf::%u\n", rid, + // a[k].qn+1, "lc"[ug->u.a[a[k].qn].circ], ug->u.a[a[k].qn].len, a[k].qs, a[k].qe, "+-"[a[k].rev], + // a[k].tn+1, "lc"[ug->u.a[a[k].tn].circ], ug->u.a[a[k].tn].len, a[k].ts, a[k].te, a[k].nw, a[k].f); + // } + kv_push(uint64_t, *sp, (((uint64_t)a[k].qs)<<32)|((uint64_t)a[k].qe)); + } + for (k = 0; k < ol->length; k++) { + // if(b->olist.list[k].non_homopolymer_errors == 0) continue;///exact match is not trans + s = ol->list[k].x_pos_s; e = ol->list[k].x_pos_e+1; + kv_push(uint64_t, *sp, (((uint64_t)s)<<32)|((uint64_t)e)); + } + + if(sp->n > 1) { + radix_sort_gfa64(sp->a, sp->a + sp->n); + for (k = z = 0; k < sp->n; k++) { + s = sp->a[k]>>32; e = (uint32_t)sp->a[k]; + if(z > 0 && s <= ((uint32_t)sp->a[z-1])) { + if(e > ((uint32_t)sp->a[z-1])) { + sp->a[z-1] >>= 32; sp->a[z-1] <<= 32; sp->a[z-1] |= e; + } + } else { + sp->a[z++] = sp->a[k]; + } + } + sp->n = z; + } + for (k = l = 0; k < sp->n; k++) l += (((uint32_t)sp->a[k]) - (sp->a[k]>>32)); + + // if(rid == 7) { + // fprintf(stderr, "\n[M::%s] utg%.6lu%c, rid::%lu, l::%lu, len::%lu, sp->n::%u\n", + // __func__, rid+1, "lc"[ug->u.a[rid].circ], rid, l, len, (uint32_t)sp->n); + // for (k = 0; k < a_n; k++) { + // if(a[k].del) continue; + // fprintf(stderr, "0\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tnw::%f\tf::%u\n", + // a[k].qn+1, "lc"[ug->u.a[a[k].qn].circ], ug->u.a[a[k].qn].len, + // a[k].qs, a[k].qe, "+-"[a[k].rev], + // a[k].tn+1, "lc"[ug->u.a[a[k].tn].circ], ug->u.a[a[k].tn].len, + // a[k].ts, a[k].te, a[k].nw, a[k].f); + // } + // for (k = 0; k < ol->length; k++) { + // overlap_region *z = &(ol->list[k]); + // fprintf(stderr, "1\tutg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\n", + // z->x_id+1, "lc"[ug->u.a[z->x_id].circ], ug->u.a[z->x_id].len, + // z->x_pos_s, z->x_pos_e+1, "+-"[z->y_pos_strand], + // z->y_id+1, "lc"[ug->u.a[z->y_id].circ], ug->u.a[z->y_id].len, + // z->y_pos_s, z->y_pos_e+1); + // } + // for (k = 0; k < sp->n; k++) { + // fprintf(stderr, "in::[%lu, %u)\n", (sp->a[k]>>32), ((uint32_t)sp->a[k])); + // } + // } + if((l > 0) && (l >= (len*0.8))) return 1; + // fprintf(stderr, "\n[M::%s] utg%.6lu%c, rid::%lu, l::%lu, len::%lu, sp->n::%u\n", + // __func__, rid+1, "lc"[ug->u.a[rid].circ], rid, l, len, (uint32_t)sp->n); + return 0; +} + +void push_ul_ov_t(ul_idx_t *udb, u_trans_t *a, uint64_t a_n, uint64_t rid, st_mt_t *sp, overlap_region_alloc* ol, uint64_t len, uint64_t is_arc_filter, double max_err, kv_ul_ov_t *res) +{ + uint64_t cnt, z, k, l, m, spn; ul_ov_t *p; + sp->n = 0; + if(a_n) { + for (k = sp->n = 0; k < a_n; k++) { + if(a[k].del) continue; + z = a[k].tn; z <<= 1; z |= a[k].rev; z <<= 32; + z |= k; z |= ((uint64_t)0x80000000); + kv_push(uint64_t, *sp, z); + } + for (k = 0; k < ol->length; k++) { + z = ol->list[k].y_id; z <<= 1; + z |= ol->list[k].y_pos_strand; + z <<= 32; z |= k; + kv_push(uint64_t, *sp, z); + } + + radix_sort_gfa64(sp->a, sp->a + sp->n); spn = sp->n; + for (k = 1, l = m = 0; k <= spn; k++) { + if(k == spn || (sp->a[l]>>32)!=(sp->a[k]>>32)) { + if((sp->a[l]&((uint64_t)0x80000000))) {///no overlap within ol + for (z = l; z < k; z++) { + sp->a[m++] = (uint32_t)(sp->a[z]-((uint64_t)0x80000000)); + } + } + l = k; + } + } + sp->n = m; + } + + if(ol->length > 0) { + cnt = res->n + ol->length; kv_resize(ul_ov_t, (*res), cnt); + for (k = 0/**, bn = res->n**/; k < ol->length; k++) { + // if(b->olist.list[k].non_homopolymer_errors == 0) continue;///exact match is not trans + kv_pushp(ul_ov_t, (*res), &p); + p->qn = rid; p->tn = ol->list[k].y_id; p->el = 1; + p->sec = ol->list[k].non_homopolymer_errors; + p->rev = ol->list[k].y_pos_strand; + p->qs = ol->list[k].x_pos_s; p->qe = ol->list[k].x_pos_e+1; + if(p->rev) { + p->ts = udb->ug->u.a[p->tn].len - (ol->list[k].y_pos_e+1); + p->te = udb->ug->u.a[p->tn].len - ol->list[k].y_pos_s; + } else { + p->ts = ol->list[k].y_pos_s; + p->te = ol->list[k].y_pos_e+1; + } + // fprintf(stderr, ">0<[M::%s] utg%.6u%c -> utg%.6u%c\n", __func__, + // p->qn+1, "lc"[s->udb.ug->u.a[p->qn].circ], + // p->tn+1, "lc"[s->udb.ug->u.a[p->tn].circ]); + // if(i == 5) + // { + // fprintf(stderr, "***utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\ti::%ld\n", + // p->qn+1, "lc"[s->ug->u.a[p->qn].circ], s->ug->u.a[p->qn].len, p->qs, p->qe, "+-"[p->rev], + // p->tn+1, "lc"[s->ug->u.a[p->tn].circ], s->ug->u.a[p->tn].len, p->ts, p->te, i); + // } + if((is_arc_filter) && (!trans_ovlp_connect(p, udb->ug))) res->n--; + // fprintf(stderr, ">1<[M::%s] utg%.6u%c -> utg%.6u%c\n", __func__, + // p->qn+1, "lc"[s->udb.ug->u.a[p->qn].circ], + // p->tn+1, "lc"[s->udb.ug->u.a[p->tn].circ]); + // else { + // if(i == 5) + // { + // fprintf(stderr, ">>>utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\n", + // p->qn+1, "lc"[udb->ug->u.a[p->qn].circ], udb->ug->u.a[p->qn].len, p->qs, p->qe, "+-"[p->rev], + // p->tn+1, "lc"[udb->ug->u.a[p->tn].circ], udb->ug->u.a[p->tn].len, p->ts, p->te); + // } + // } + } + + for (k = 0; k < sp->n; k++) { + kv_pushp(ul_ov_t, (*res), &p); + p->qn = a[sp->a[k]].qn; p->tn = a[sp->a[k]].tn; + p->rev = a[sp->a[k]].rev; p->el = 0; + p->qs = a[sp->a[k]].qs; p->qe = a[sp->a[k]].qe; + p->ts = a[sp->a[k]].ts; p->te = a[sp->a[k]].te; + p->sec = (p->qe-p->qs)*max_err; + } - ul_lalign(&b->olist, &b->clist, s->uu, s->uopt, s->seq[i], s->len[i], &b->self_read, &b->ovlp_read, - &b->correct, &b->exz, &b->hap, &b->r_buf, aux_o, s->opt->diff_ec_ul, winLen, &(bl->lo), s->id+i, s->opt->k, &(s->sps[tid]), NULL); - // ul_lalign_old_ed(&b->olist, &b->clist, s->uu, s->seq[i], s->len[i], &b->self_read, &b->ovlp_read, - // &b->correct, &b->hap, &b->r_buf, s->opt->diff_ec_ul, winLen, 0, NULL); - ///recover alignments - for (k = b->olist.length; k < ton; k++) { - b->olist.list[k].w_list.n = 0; - p.x_start = b->olist.list[k].x_pos_s; - p.x_end = b->olist.list[k].x_pos_e+1; - p.clen = b->olist.list[k].non_homopolymer_errors; - kv_push(window_list, b->olist.list[k].w_list, p); - b->olist.list[k].align_length = 0; - b->olist.list[k].overlapLen = b->olist.list[k].x_pos_e+1-b->olist.list[k].x_pos_s; + // if(is_sec_filter) { + // cnt = filter_sec_trans_ovlp(res->a+bn, res->n-bn, rid, sec_rate) + bn; + // res->n = cnt; + // } + // fprintf(stderr, "-1-[M::%s] utg%.6u%c, # ov::%lu\n", + // __func__, (uint32_t)i+1, "lc"[s->ug->u.a[i].circ], cnt - bn); + } +} + +void backward_dedup_ol(uint64_t rid, kv_ul_ov_t *bck, st_mt_t *sp, overlap_region_alloc* ol) +{ + int64_t i, m; uint64_t z, k, l; overlap_region t; + + kv_resize(uint64_t, *sp, ol->length); + for (i = ((int64_t)bck->n)-1, sp->n = 0; i >= 0 && bck->a[i].qn == rid; i--) { + z = bck->a[i].tn; z <<= 1; z |= bck->a[i].rev; z <<= 32; + kv_push(uint64_t, *sp, z); + } + if(!(sp->n)) return;///no bck overlap + + kv_resize(uint64_t, *sp, (ol->length)+sp->n); + for (k = 0; k < ol->length; k++) { + z = ol->list[k].y_id; z <<= 1; z |= ol->list[k].y_pos_strand; + z <<= 32; z |= k; z |= ((uint64_t)0x80000000); + kv_push(uint64_t, *sp, z); + } + + radix_sort_gfa64(sp->a, sp->a + sp->n); + for (k = 1, l = 0, m = 0; k <= sp->n; k++) { + if(k == sp->n || (sp->a[l]>>32)!=(sp->a[k]>>32)) { + if((k - l > 1) && (!(sp->a[l]&((uint64_t)0x80000000)))) {///overlap within bck + for (z = l; z < k; z++) { + if(sp->a[z]&((uint64_t)0x80000000)) { + ol->list[(uint32_t)(sp->a[z]-((uint64_t)0x80000000))].y_id = (uint32_t)-1; + m++; + } + } + } + l = k; } - b->olist.length = ton; - } else { - for (k = 0; k < b->olist.length; k++) { - b->olist.list[k].w_list.n = 0; - p.x_start = b->olist.list[k].x_pos_s; - p.x_end = b->olist.list[k].x_pos_e+1; - p.clen = 0; - kv_push(window_list, b->olist.list[k].w_list, p); - b->olist.list[k].align_length = b->olist.list[k].overlapLen = - b->olist.list[k].x_pos_e+1-b->olist.list[k].x_pos_s; - b->olist.list[k].non_homopolymer_errors = 0; + } + if(m > 0) { + for (k = z = 0; k < ol->length; k++) { + if(ol->list[k].y_id == (uint32_t)-1) continue; + if(z != k) { + t = ol->list[z]; + ol->list[z] = ol->list[k]; + ol->list[k] = t; + } + z++; } + ol->length = z; } - // prt_overlap_region_alloc_ol(&(b->olist), s->id+i); +} + +void remove_trans_ovlp_connect(ma_ug_t *ug, uint64_t rid, kv_ul_ov_t *bck) +{ + int64_t i; uint64_t m, k; + for (i = ((int64_t)bck->n)-1; i >= 0 && bck->a[i].qn == rid; i--); + m = i + 1; + if(m >= bck->n) return; + for (k = m; k < bck->n; k++) { + if(!trans_ovlp_connect(&(bck->a[k]), ug)) continue; + bck->a[m++] = bck->a[k]; + } + bck->n = m; +} + +uint64_t gen_trans_adaptive_aln(ug_trans_t *s, uint64_t rid, ha_ovec_buf_t *b, kv_ul_ov_t *bl, char *seq, uint64_t len, kv_u_trans_t *fi, + double err_low, double err_high, double bw_low, double bw_high) +{ + uint64_t cnt = ((s->idx_n.a[rid+1]-s->idx_n.a[rid])), ol_h = 0, pass_aln = 0; + uint32_t high_occ = asm_opt.polyploidy + 1; overlap_region *aux_o = NULL; + ///note: high_occ is different + ug_map_lchain(b->abl, rid, seq, len, s->w, s->k, &(s->udb), &b->olist, &b->clist, bw_low, bw_high, + s->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2, 3, + s->is_HPC, s->idx_a.a + s->idx_n.a[rid], cnt, s->srt_a.a, s->srt_a.n, s->mini_cut, s->chain_cut, fi); + // if(rid == 160) { + // fprintf(stderr, "-1-[M::%s] utg%.6lu%c, rid::%ld, b->olist->length::%lu\n", + // __func__, rid+1, "lc"[s->ug->u.a[rid].circ], rid, b->olist.length); + // } + ///remove candidate chains that have been calculated + if(!fi) backward_dedup_ol(rid, bl, &(b->sp), &b->olist); + // if(rid == 160) { + // fprintf(stderr, "-2-[M::%s] utg%.6lu%c, rid::%ld, b->olist->length::%lu\n", + // __func__, rid+1, "lc"[s->ug->u.a[rid].circ], rid, b->olist.length); + // } + filter_by_reliable_ovlp_adv(rid, s->filter, &(b->sp), &b->olist, &(s->udb), s->sec_cutoff, 1, &ol_h); + clear_Cigar_record(&b->cigar1); clear_Round2_alignment(&b->round2); + if(!fi) ol_h = 0; + ol_h = split_ug_lalign(ol_h, &b->olist, err_high, err_low, + &b->clist, &(s->udb), s->uopt, seq, len, &b->self_read, &b->ovlp_read, + &b->correct, &b->exz, aux_o, rid, s->k, s->chain_cut, NULL); - gl_chain(s->buf[tid], &(UL_INF.a[s->id+i]), &b->olist, &(b->clist.chainDP), &b->hap, &(s->sps[tid]), bl, &(s->gdp[tid]), s->uu, s->opt->diff_ec_ul, winLen, s->len[i], s->uopt, s->id+i, tid, NULL); + aux_o = gen_aux_ovlp(&b->olist);///must be here - - if(UL_INF.a[s->id+i].dd == 1 || UL_INF.a[s->id+i].dd == 2) { - b->num_correct_base++; - } - if(UL_INF.a[s->id+i].dd != 3) { - free(s->seq[i]); s->seq[i] = NULL; + ol_h = split_ug_lalign(ol_h, &b->olist, err_high, err_low, + &b->clist, &(s->udb), s->uopt, seq, len, &b->self_read, &b->ovlp_read, + &b->correct, &b->exz, aux_o, rid, s->k, s->chain_cut, NULL); + + if(fi) {///first round + pass_aln = test_het_aln(s->ug, rid, u_trans_a((*(s->filter)), rid), u_trans_n((*(s->filter)), rid), &(b->sp), &b->olist, len); + push_ul_ov_t(&(s->udb), u_trans_a((*(s->filter)), rid), u_trans_n((*(s->filter)), rid), rid, &(b->sp), &b->olist, len, pass_aln, err_high, bl); + // fprintf(stderr, "-1-[M::%s] utg%.6lu%c, rid::%lu, pass_aln::%lu\n", + // __func__, rid+1, "lc"[s->ug->u.a[rid].circ], rid, pass_aln); + } else {///second round + push_ul_ov_t(&(s->udb), NULL, 0, rid, &(b->sp), &b->olist, len, 0, err_high, bl); + remove_trans_ovlp_connect(s->udb.ug, rid, bl); + } + return pass_aln; +} + +static void worker_for_trans_ovlp_adv(void *data, long i, int tid) // callback for kt_for() +{ + ug_trans_t *s = (ug_trans_t*)data; + ha_ovec_buf_t *b = s->hab[tid]; kv_ul_ov_t *bl = &(s->ll[tid].tk); + uint32_t high_occ = asm_opt.polyploidy + 1; uint64_t cnt; + char *seq = s->ug->u.a[i].s; int64_t len = s->ug->u.a[i].len; + if((!s->is_ovlp) && (s->is_cnt)) s->idx_n.a[i] = 0; + if(IF_HOM(i, (*(s->bub))) || s->ug->g->seq[i].del) return; + + // asprintf(&as, "\n[M::%s] rid::%ld, len::%lu, name::%.*s\n", __func__, s->id+i, s->len[i], (int32_t)UL_INF.nid.a[s->id+i].n, UL_INF.nid.a[s->id+i].a); + // push_vlog(&(overall_zdbg->a[s->id+i]), as); free(as); as = NULL; + + if(!s->is_ovlp) { + if(s->is_cnt) { + s->idx_n.a[i] = ug_map_lchain(b->abl, i, seq, len, s->w, s->k, &(s->udb), NULL, NULL, s->bw_thres, s->bw_thres_double, + s->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2, 3, s->is_HPC, NULL, 0, NULL, 0, s->mini_cut, s->chain_cut, NULL); + } else { + cnt = ug_map_lchain(b->abl, i, seq, len, s->w, s->k, &(s->udb), NULL, NULL, s->bw_thres, s->bw_thres_double, + s->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2, 3, s->is_HPC, s->idx_a.a + s->idx_n.a[i], 0, NULL, 0, s->mini_cut, s->chain_cut, NULL); + assert(cnt == ((s->idx_n.a[i+1]-s->idx_n.a[i]))); + } + return; + } + + // if(i == 160) { + // fprintf(stderr, "\n-1-[M::%s] utg%.6u%c, rid::%ld, is_ovlp::%d, is_cnt::%d, len::%ld, str::%u\n", + // __func__, (uint32_t)i+1, "lc"[s->ug->u.a[i].circ], i, s->is_ovlp, s->is_cnt, len, (uint32_t)(!!seq)); + // } + + if(!gen_trans_adaptive_aln(s, i, b, bl, seq, len, s->filter, s->diff_ec_ul, s->diff_ec_ul_double, s->bw_thres, s->bw_thres_double)) { + gen_trans_adaptive_aln(s, i, b, bl, seq, len, NULL, s->diff_ec_ul_double, s->diff_ec_ul_double, s->bw_thres_double, s->bw_thres_double); + } +} + +ul_ov_t* get_trans_ul_ov_t(ug_trans_t *s, uint32_t qn, uint32_t tn) +{ + ha_mzl_t *f = &(s->srt_a.a[qn]); + if(!(f->rev)) return NULL; + kv_ul_ov_t *bl = &(s->ll[(uint32_t)(f->x)].tk); + uint64_t k = f->rid; k <<= 32; k += f->pos; + // if(!(bl->a[k].qn == (f->x>>32))) { + // fprintf(stderr, "[M::%s::] k::%lu, qn::%u, tn::%u, x>>32::%lu, (uint32_t)x::%u\n", + // __func__, k, qn, tn, (f->x>>32), (uint32_t)(f->x)); + // } + // if(!(bl->a[k].qn == (f->x>>32))) { + // fprintf(stderr, "[M::%s::] k::%lu, bl->a[k].qn::%u, (f->x>>32)::%lu, (uint32_t)(f->x)::%u\n", + // __func__, k, bl->a[k].qn, (f->x>>32), (uint32_t)(f->x)); + // } + assert(bl->a[k].qn == (f->x>>32)); + for (; (k < bl->n) && (bl->a[k].qn == (f->x>>32)); k++) { + if(bl->a[k].qn == bl->a[k].tn) continue; + if(bl->a[k].tn == tn) return &(bl->a[k]); + } + return NULL; +} + +double inline cal_trans_ov_w(ul_ov_t *z) +{ + double nw[2], ml, uml; + ml = z->qe - z->qs; uml = z->sec; ml -= uml; + nw[0] = (ml*CHAIN_MATCH) - (uml*CHAIN_UNMATCH); + ml = z->te - z->ts; uml = z->sec; ml -= uml; + nw[1] = (ml*CHAIN_MATCH) - (uml*CHAIN_UNMATCH); + return MIN(nw[0], nw[1]); +} + +static void worker_for_sysm_trans_ovlp(void *data, long i, int tid) // callback for kt_for() +{ + ug_trans_t *s = (ug_trans_t*)data; + uint64_t k; ul_ov_t *z; double w0, w1; + ha_mzl_t *f = &(s->srt_a.a[i]); + if(!(f->rev)) return; + kv_ul_ov_t *bl = &(s->ll[(uint32_t)(f->x)].tk); + k = f->rid; k <<= 32; k += f->pos; + // if(!(bl->a[k].qn == (f->x>>32))) { + // fprintf(stderr, "[M::%s::] k::%lu, bl->a[k].qn::%u, (f->x>>32)::%lu, (uint32_t)(f->x)::%u\n", + // __func__, k, bl->a[k].qn, (f->x>>32), (uint32_t)(f->x)); + // } + assert(bl->a[k].qn == (f->x>>32)); + for (; (k < bl->n) && (bl->a[k].qn == (f->x>>32)); k++) { + if(bl->a[k].qn == bl->a[k].tn) { + bl->a[k].qs = bl->a[k].qe = (uint32_t)-1; + continue; + } + + w0 = cal_trans_ov_w(&(bl->a[k])); + z = get_trans_ul_ov_t(s, bl->a[k].tn, bl->a[k].qn); + if(!z) { + if((!(s->keep_unsymm_arc)) || (w0 <= 0)) { + bl->a[k].qs = bl->a[k].qe = (uint32_t)-1; + } + continue; + } + if(bl->a[k].qn > bl->a[k].tn) continue; + + w1 = cal_trans_ov_w(z); + if(w0 <= 0 && w1 <= 0) { + bl->a[k].qs = bl->a[k].qe = (uint32_t)-1; + z->qs = z->qe = (uint32_t)-1; + continue; + } + if(w0 > w1) { + // (*z) = bl->a[k];///cannot copy, this will affect tn&&qn + z->qs = bl->a[k].ts; z->qe = bl->a[k].te; + z->ts = bl->a[k].qs; z->te = bl->a[k].qe; + z->sec = bl->a[k].sec; z->el = bl->a[k].el; z->rev = bl->a[k].rev; + // z->qn = bl->a[k].tn; z->tn = bl->a[k].qn; + } else { + // bl->a[k] = (*z);///cannot copy, this will affect tn&&qn + bl->a[k].ts = z->qs; bl->a[k].te = z->qe; + bl->a[k].qs = z->ts; bl->a[k].qe = z->te; + bl->a[k].sec = z->sec; bl->a[k].el = z->el; bl->a[k].rev = z->rev; + // bl->a[k].tn = z->qn; bl->a[k].qn = z->tn; + } } - s->hab[tid]->num_read_base++; - **/ } @@ -9652,7 +10449,7 @@ static void *worker_ul_scall_pipeline(void *data, int step, void *in) // callbac } // fprintf(stderr, "[M::%s::Start] ==> s->id: %lu, s->n:% d\n", __func__, s->id, s->n); kt_for(p->n_thread, worker_for_ul_scall_alignment, s, s->n); - // fprintf(stderr, "[M::%s::Done] ==> s->id: %lu, s->n:% d\n", __func__, s->id, s->n); + fprintf(stderr, "[M::%s::Done] ==> s->id: %lu, s->n:% d\n", __func__, s->id, s->n); ///debug /** uint64_t i; @@ -9783,7 +10580,7 @@ static void *worker_ul_rescall_pipeline(void *data, int step, void *in) // callb } // fprintf(stderr, "[M::%s::Start] ==> s->id: %lu, s->n:% d\n", __func__, s->id, s->n); kt_for(p->n_thread, worker_for_ul_rescall_alignment, s, s->n); - // fprintf(stderr, "[M::%s::Done] ==> s->id: %lu, s->n:% d\n", __func__, s->id, s->n); + fprintf(stderr, "[M::%s::Done] ==> s->id: %lu, s->n:% d\n", __func__, s->id, s->n); // get_utepdat_t_mem(s, 1); for (i = 0; i < p->n_thread; ++i) { @@ -15034,17 +15831,25 @@ void clear_all_ul_t(all_ul_t *x) free(x->ridx.idx.a); free(x->ridx.occ.a); memset(&(x->ridx), 0, sizeof((x->ridx))); } -void init_ug_trans_t(ug_trans_t *opt, int32_t is_HPC, int32_t k, int32_t w, int32_t max_n_chain, -double bw_thres, double diff_ec_ul, int32_t n_thread, ma_ug_t *ug, asg_t *sg) +void init_ug_trans_t(ug_trans_t *opt, ug_opt_t *uopt, int32_t is_HPC, int32_t k, int32_t w, int32_t max_n_chain, +double bw_thres, double diff_ec_ul, double bw_thres_double, double diff_ec_ul_double, double sec_cutoff, int32_t n_thread, +int32_t mini_cut, int32_t chain_cut, int32_t keep_unsymm_arc, ma_ug_t *ug, asg_t *sg, bubble_type *bub) { - int64_t i; + int64_t i; uint8_t *bf = NULL; memset(opt, 0, sizeof((*opt))); + opt->uopt = uopt; opt->k = k; opt->w = w; opt->is_HPC = is_HPC; opt->max_n_chain = max_n_chain; opt->bw_thres = bw_thres; + opt->bw_thres_double = bw_thres_double; opt->diff_ec_ul = diff_ec_ul; + opt->diff_ec_ul_double = diff_ec_ul_double; + opt->sec_cutoff = sec_cutoff; + opt->mini_cut = mini_cut; + opt->chain_cut = chain_cut; + opt->keep_unsymm_arc = keep_unsymm_arc; opt->bw = 10000;///2000 in minigraph opt->max_gap = 500000;///5000 in minigraph @@ -15058,24 +15863,263 @@ double bw_thres, double diff_ec_ul, int32_t n_thread, ma_ug_t *ug, asg_t *sg) opt->hab[i] = ha_ovec_init(0, 0, 1); } - opt->idx_n.n = opt->idx_n.m = ug->u.n; + opt->idx_n.n = opt->idx_n.m = ug->u.n+1; MALLOC(opt->idx_n.a, opt->idx_n.n); opt->udb.ug = ug; + if(bub) { + opt->bub = bub; + } else { + opt->bub = gen_bubble_chain(sg, ug, uopt, &bf); free(bf); + } } -void trans_base_count(ug_trans_t *p) +void gen_trans_base_count(ug_trans_t *p, kv_u_trans_t *res) { - p->is_cnt = 1; ha_flt_tab = NULL; + double index_time = yak_realtime(); + // ha_flt_tab = NULL; + uint64_t i, k, l, occ, m, cc; kv_ul_ov_t *bl = NULL; + u_trans_t *z; ha_mzl_t *tz; + clean_u_trans_t_idx_adv(res, p->ug, p->rg); p->filter = res; + fprintf(stderr, "[M::%s::] ==> 0\n", __func__); + p->is_cnt = 1; p->is_ovlp = 0; + kt_for(p->n_thread, worker_for_trans_ovlp, p, p->ug->u.n); + for (i = l = 0; i < p->ug->u.n; i++) { + occ = p->idx_n.a[i]; p->idx_n.a[i] = l; l += occ; + } + fprintf(stderr, "[M::%s::] i::%lu, l::%lu\n", __func__, i, l); + p->idx_n.a[i] = l; + p->idx_a.n = p->idx_a.m = l; MALLOC(p->idx_a.a, p->idx_a.n); + fprintf(stderr, "[M::%s::] ==> 1\n", __func__); + p->is_cnt = 0; p->is_ovlp = 0; kt_for(p->n_thread, worker_for_trans_ovlp, p, p->ug->u.n); + p->srt_a.n = p->srt_a.m = p->idx_a.n; MALLOC(p->srt_a.a, p->srt_a.n); + fprintf(stderr, "[M::%s::] p->idx_a.n::%lu \n", __func__, (uint64_t)p->idx_a.n); + // memcpy(p->srt_a.a, p->idx_a.a, p->srt_a.n*sizeof((*(p->srt_a.a)))); + for (i = 0; i < p->srt_a.n; i++) { + p->srt_a.a[i] = p->idx_a.a[i]; + p->srt_a.a[i].pos = (uint32_t)i; + p->srt_a.a[i].rid = i>>32; + } + radix_sort_ha_mzl_t_srt(p->srt_a.a, p->srt_a.a + p->srt_a.n); + kvec_t(uint64_t) cut; kv_init(cut); + for (k = 1, l = 0; k <= p->srt_a.n; k++) { + if(k == p->srt_a.n || p->srt_a.a[l].x != p->srt_a.a[k].x) { + for (i = l; i < k; i++) { + m = p->srt_a.a[i].rid; m <<= 32; m |= p->srt_a.a[i].pos; + assert(p->srt_a.a[i].x == p->idx_a.a[m].x); + p->srt_a.a[i] = p->idx_a.a[m]; p->idx_a.a[m].x = i; + } + kv_push(uint64_t, cut, (k - l)); + l = k; + } + } + + if(cut.n > 0) { + radix_sort_gfa64(cut.a, cut.a + cut.n); + m = cut.n * 0.0002; cc = cut.a[cut.n-1] + 1; + if(m > 0 && m <= cut.n) cc = cut.a[cut.n-m] + 1; + if(cc < (uint64_t)p->mini_cut) p->mini_cut = cc; + } + kv_destroy(cut); + fprintf(stderr, "[M::%s::] p->mini_cut::%d \n", __func__, p->mini_cut); + fprintf(stderr, "[M::%s::] ==> 2\n", __func__); + p->is_cnt = 0; p->is_ovlp = 1; + kt_for(p->n_thread, worker_for_trans_ovlp, p, p->ug->u.n); + fprintf(stderr, "[M::%s::] ==> 3\n", __func__); + for (i = 0; (int64_t)i < p->n_thread; i++) { + ha_ovec_destroy(p->hab[i]); + free(p->ll[i].lo.a); free(p->ll[i].srt.a.a); free(p->ll[i].tc.a); + } + free(p->idx_a.a); free(p->idx_n.a); free(p->hab); + // destory_bubbles(p->bub); free(p->bub); + + fprintf(stderr, "[M::%s::] ==> 4\n", __func__); + ///make results consistent + kv_resize(ha_mzl_t, p->srt_a, p->ug->u.n); p->srt_a.n = p->ug->u.n; + for (i = 0; i < p->srt_a.n; i++) { + tz = &(p->srt_a.a[i]); + tz->x = (uint64_t)-1; tz->rev = 0; + tz->pos = tz->rid = tz->span = 0; + } + // memset(p->srt_a.a, 0, sizeof((*(p->srt_a.a)))*p->srt_a.n); + for (i = 0, occ = res->n; (int64_t)i < p->n_thread; i++) { + bl = &(p->ll[i].tk); + if(!(bl->n)) continue; + for (k = 1, l = 0; k <= bl->n; k++) { + if(k == bl->n || bl->a[k].qn != bl->a[l].qn) { + if(k > l) { + tz = &(p->srt_a.a[bl->a[l].qn]); + tz->x = bl->a[l].qn; tz->x <<= 32; tz->x |= i; + tz->rid = l>>32; tz->pos = (uint32_t)l; tz->rev = 1; + occ += (k - l); + } + l = k; + } + } + } + fprintf(stderr, "[M::%s::] ==> 5\n", __func__); + kt_for(p->n_thread, worker_for_sysm_trans_ovlp, p, p->ug->u.n); + // assert(p->srt_a.n <= p->ug->u.n); + // radix_sort_ha_mzl_t_srt(p->srt_a.a, p->srt_a.a + p->srt_a.n); + kv_resize(u_trans_t, *res, occ); + for (i = 0; i < p->srt_a.n; i++) { + tz = &(p->srt_a.a[i]); + if(!(tz->rev)) continue; + bl = &(p->ll[(uint32_t)(tz->x)].tk); + k = tz->rid; k <<= 32; k += tz->pos; + assert(bl->a[k].qn == (tz->x>>32)); + for (; (k < bl->n) && (bl->a[k].qn == (tz->x>>32)); k++) { + if(bl->a[k].qn == bl->a[k].tn) continue; + if(bl->a[k].qs == (uint32_t)-1 && bl->a[k].qe == (uint32_t)-1) continue; + + kv_pushp(u_trans_t, *res, &z); + z->f = RC_3; z->rev = bl->a[k].rev; z->del = 0; + z->qn = bl->a[k].qn; z->qs = bl->a[k].qs; z->qe = bl->a[k].qe; + z->tn = bl->a[k].tn; z->ts = bl->a[k].ts; z->te = bl->a[k].te; + z->nw = cal_trans_ov_w(&(bl->a[k])); assert(z->nw > 0); + // if(z->nw <= 0) res->n--; + } + } + fprintf(stderr, "[M::%s::] ==> 6\n", __func__); + for (i = 0; (int64_t)i < p->n_thread; i++) free(p->ll[i].tk.a); + free(p->srt_a.a); free(p->ll); + fprintf(stderr, "[M::%s::%.3f] ==> Qualification\n", __func__, yak_realtime()-index_time); +} + + +void gen_trans_base_count_comp(ug_trans_t *p, kv_u_trans_t *res) +{ + double index_time = yak_realtime(); + // ha_flt_tab = NULL; + uint64_t i, k, l, occ, m, cc; kv_ul_ov_t *bl = NULL; + u_trans_t *z; ha_mzl_t *tz; + clean_u_trans_t_idx_adv(res, p->ug, p->rg); p->filter = res; + // fprintf(stderr, "[M::%s::] ==> 0\n", __func__); + p->is_cnt = 1; p->is_ovlp = 0; + kt_for(p->n_thread, worker_for_trans_ovlp_adv, p, p->ug->u.n); + for (i = l = 0; i < p->ug->u.n; i++) { + occ = p->idx_n.a[i]; p->idx_n.a[i] = l; l += occ; + } + // fprintf(stderr, "[M::%s::] i::%lu, l::%lu\n", __func__, i, l); + p->idx_n.a[i] = l; + p->idx_a.n = p->idx_a.m = l; MALLOC(p->idx_a.a, p->idx_a.n); + // fprintf(stderr, "[M::%s::] ==> 1\n", __func__); + p->is_cnt = 0; p->is_ovlp = 0; + kt_for(p->n_thread, worker_for_trans_ovlp_adv, p, p->ug->u.n); + p->srt_a.n = p->srt_a.m = p->idx_a.n; MALLOC(p->srt_a.a, p->srt_a.n); + // fprintf(stderr, "[M::%s::] p->idx_a.n::%lu \n", __func__, (uint64_t)p->idx_a.n); + // memcpy(p->srt_a.a, p->idx_a.a, p->srt_a.n*sizeof((*(p->srt_a.a)))); + for (i = 0; i < p->srt_a.n; i++) { + p->srt_a.a[i] = p->idx_a.a[i]; + p->srt_a.a[i].pos = (uint32_t)i; + p->srt_a.a[i].rid = i>>32; + } + radix_sort_ha_mzl_t_srt(p->srt_a.a, p->srt_a.a + p->srt_a.n); + kvec_t(uint64_t) cut; kv_init(cut); + for (k = 1, l = 0; k <= p->srt_a.n; k++) { + if(k == p->srt_a.n || p->srt_a.a[l].x != p->srt_a.a[k].x) { + for (i = l; i < k; i++) { + m = p->srt_a.a[i].rid; m <<= 32; m |= p->srt_a.a[i].pos; + assert(p->srt_a.a[i].x == p->idx_a.a[m].x); + p->srt_a.a[i] = p->idx_a.a[m]; p->idx_a.a[m].x = i; + } + kv_push(uint64_t, cut, (k - l)); + l = k; + } + } + + if(cut.n > 0) { + radix_sort_gfa64(cut.a, cut.a + cut.n); + m = cut.n * 0.0002; cc = cut.a[cut.n-1] + 1; + if(m > 0 && m <= cut.n) cc = cut.a[cut.n-m] + 1; + if(cc < (uint64_t)p->mini_cut) p->mini_cut = cc; + } + kv_destroy(cut); + // fprintf(stderr, "[M::%s::] p->mini_cut::%d \n", __func__, p->mini_cut); + + // fprintf(stderr, "[M::%s::] ==> 2\n", __func__); + p->is_cnt = 0; p->is_ovlp = 1; + kt_for(p->n_thread, worker_for_trans_ovlp_adv, p, p->ug->u.n); + // fprintf(stderr, "[M::%s::] ==> 3\n", __func__); + for (i = 0; (int64_t)i < p->n_thread; i++) { + ha_ovec_destroy(p->hab[i]); + free(p->ll[i].lo.a); free(p->ll[i].srt.a.a); free(p->ll[i].tc.a); + } + free(p->idx_a.a); free(p->idx_n.a); free(p->hab); + // destory_bubbles(p->bub); free(p->bub); + + // fprintf(stderr, "[M::%s::] ==> 4\n", __func__); + ///make results consistent + kv_resize(ha_mzl_t, p->srt_a, p->ug->u.n); p->srt_a.n = p->ug->u.n; + for (i = 0; i < p->srt_a.n; i++) { + tz = &(p->srt_a.a[i]); + tz->x = (uint64_t)-1; tz->rev = 0; + tz->pos = tz->rid = tz->span = 0; + } + // memset(p->srt_a.a, 0, sizeof((*(p->srt_a.a)))*p->srt_a.n); + for (i = 0, occ = res->n; (int64_t)i < p->n_thread; i++) { + bl = &(p->ll[i].tk); + if(!(bl->n)) continue; + for (k = 1, l = 0; k <= bl->n; k++) { + if(k == bl->n || bl->a[k].qn != bl->a[l].qn) { + if(k > l) { + tz = &(p->srt_a.a[bl->a[l].qn]); + tz->x = bl->a[l].qn; tz->x <<= 32; tz->x |= i; + tz->rid = l>>32; tz->pos = (uint32_t)l; tz->rev = 1; + occ += (k - l); + } + l = k; + } + } + } + // fprintf(stderr, "[M::%s::] ==> 5\n", __func__); + kt_for(p->n_thread, worker_for_sysm_trans_ovlp, p, p->ug->u.n); + // assert(p->srt_a.n <= p->ug->u.n); + // radix_sort_ha_mzl_t_srt(p->srt_a.a, p->srt_a.a + p->srt_a.n); + kv_resize(u_trans_t, *res, occ); + for (i = 0; i < p->srt_a.n; i++) { + tz = &(p->srt_a.a[i]); + if(!(tz->rev)) continue; + bl = &(p->ll[(uint32_t)(tz->x)].tk); + k = tz->rid; k <<= 32; k += tz->pos; + assert(bl->a[k].qn == (tz->x>>32)); + for (; (k < bl->n) && (bl->a[k].qn == (tz->x>>32)); k++) { + if(bl->a[k].qn == bl->a[k].tn) continue; + if(bl->a[k].qs == (uint32_t)-1 && bl->a[k].qe == (uint32_t)-1) continue; + + kv_pushp(u_trans_t, *res, &z); + z->f = RC_3; z->rev = bl->a[k].rev; z->del = 0; + z->qn = bl->a[k].qn; z->qs = bl->a[k].qs; z->qe = bl->a[k].qe; + z->tn = bl->a[k].tn; z->ts = bl->a[k].ts; z->te = bl->a[k].te; + z->nw = cal_trans_ov_w(&(bl->a[k])); assert(z->nw > 0); + // if(z->qn == 56 || z->qn == 160 || z->tn == 56 || z->tn == 160) { + // fprintf(stderr, ">>>utg%.6u%c\t%u\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\t%u\tnw::%f\n", + // z->qn+1, "lc"[p->ug->u.a[z->qn].circ], p->ug->u.a[z->qn].len, z->qs, z->qe, "+-"[z->rev], + // z->tn+1, "lc"[p->ug->u.a[z->tn].circ], p->ug->u.a[z->tn].len, z->ts, z->te, z->nw); + // } + // if(z->nw <= 0) res->n--; + } + } + // fprintf(stderr, "[M::%s::] ==> 6\n", __func__); + for (i = 0; (int64_t)i < p->n_thread; i++) free(p->ll[i].tk.a); + free(p->srt_a.a); free(p->ll); + fprintf(stderr, "[M::%s::%.3f] ==> Qualification\n", __func__, yak_realtime()-index_time); } -void trans_base_infer(ma_ug_t *ug, asg_t *sg) +void trans_base_infer(ma_ug_t *ug, asg_t *sg, ug_opt_t *uopt, kv_u_trans_t *res, bubble_type *bub) { ug_trans_t sl; init_aux_table(); ha_opt_update_cov(&asm_opt, asm_opt.hom_cov); - init_ug_trans_t(&sl, 0, asm_opt.trans_mer_length, asm_opt.trans_win, asm_opt.max_n_chain, 1.0-asm_opt.trans_base_rate, 1.0-asm_opt.trans_base_rate, asm_opt.thread_num, ug, sg); + init_ug_trans_t(&sl, uopt, 0, asm_opt.trans_mer_length, asm_opt.trans_win, asm_opt.max_n_chain, + 1.0-asm_opt.trans_base_rate, 1.0-asm_opt.trans_base_rate, 1.0-asm_opt.trans_base_rate_sec, 1.0-asm_opt.trans_base_rate_sec, + 0.85, asm_opt.thread_num, 512, 3, 1, ug, sg, bub); + // gen_trans_base_count(&sl, res); + gen_trans_base_count_comp(&sl, res); + if(!bub) { + destory_bubbles(sl.bub); free(sl.bub); + } } diff --git a/inter.h b/inter.h index a240d95..0a6f961 100644 --- a/inter.h +++ b/inter.h @@ -2,6 +2,7 @@ #define __INTER__ #include "Overlaps.h" #include "Process_Read.h" +#include "hic.h" #define G_CHAIN_BW 16//128 #define FLANK_M (0x7fffU) @@ -116,5 +117,6 @@ void gen_ul_vec_rid_t(all_ul_t *x, All_reads *rdb, ma_ug_t *ug); void update_ug_arch_ul_mul(ma_ug_t *ug); void print_ul_alignment(ma_ug_t *ug, all_ul_t *aln, uint32_t id, const char* cmd); void clear_all_ul_t(all_ul_t *x); +void trans_base_infer(ma_ug_t *ug, asg_t *sg, ug_opt_t *uopt, kv_u_trans_t *res, bubble_type *bub); #endif diff --git a/rcut.cpp b/rcut.cpp index 0b716a8..c57327e 100644 --- a/rcut.cpp +++ b/rcut.cpp @@ -3241,6 +3241,15 @@ mc_clus_t *init_mc_clus_t(const mc_opt_t *opt, mc_g_t *mg, bubble_type* bub, uin return p; } +void des_mc_clus_t(mc_clus_t *p) +{ + if((!p)) return; + uint32_t k; free(p->lock); + for (k = 0; k < p->n_thread; k++) free(p->aux[k].vis.a); + free(p->aux); free(p->cc.ng.a); free(p->cc.nn.a); + free(p); +} + void mc_solve_core_adv(const mc_opt_t *opt, mc_g_t *mg, bubble_type* bub) { double index_time = yak_realtime(); @@ -3277,7 +3286,7 @@ void mc_solve_core_adv(const mc_opt_t *opt, mc_g_t *mg, bubble_type* bub) // if(bp) mc_solve_bp(bp); ///mc_write_info(g, b); - mc_svaux_destroy(b); + mc_svaux_destroy(b); des_mc_clus_t(bc); // if(bp) destroy_mc_bp_t(&bp); fprintf(stderr, "[M::%s::%.3f] ==> Partition\n", __func__, yak_realtime()-index_time); } @@ -4464,6 +4473,28 @@ void dump_bubble_type(bubble_type* bub, FILE *fp) dump_ma_ug_t(bub->ug, fp); } +void dump_rid(All_reads *rdb, FILE *fp) +{ + fwrite(&(rdb->total_reads), sizeof(rdb->total_reads), 1, fp); + fwrite(&(rdb->name_index_size), sizeof(rdb->name_index_size), 1, fp); + fwrite(rdb->name_index, sizeof((*(rdb->name_index))), rdb->name_index_size, fp); + + fwrite(&(rdb->total_name_length), sizeof(rdb->total_name_length), 1, fp); + fwrite(rdb->name, sizeof((*(rdb->name))), rdb->total_name_length, fp); +} + +void load_rid(All_reads *rdb, FILE *fp) +{ + fread(&(rdb->total_reads), sizeof(rdb->total_reads), 1, fp); + fread(&(rdb->name_index_size), sizeof(rdb->name_index_size), 1, fp); + MALLOC(rdb->name_index, rdb->name_index_size); + fread(rdb->name_index, sizeof((*(rdb->name_index))), rdb->name_index_size, fp); + + fread(&(rdb->total_name_length), sizeof(rdb->total_name_length), 1, fp); + MALLOC(rdb->name, rdb->total_name_length); + fread(rdb->name, sizeof((*(rdb->name))), rdb->total_name_length, fp); +} + void dump_debug_phasing(const char* fn, kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g, double f_rate, uint32_t renew_s, int8_t *s, uint32_t is_sys, bubble_type* bub, kv_u_trans_t *ref) { @@ -4481,6 +4512,7 @@ uint32_t renew_s, int8_t *s, uint32_t is_sys, bubble_type* bub, kv_u_trans_t *re fwrite(&is_sys, sizeof(is_sys), 1, fp);///is_sys dump_bubble_type(bub, fp);///bub dump_kv_u_trans_t(ref, fp);///ta + dump_rid(&R_INF, fp); fclose(fp); free(buf); @@ -4586,15 +4618,43 @@ uint32_t *renew_s, int8_t **s, uint32_t *is_sys, bubble_type **bub, kv_u_trans_t CALLOC((*bub), 1); load_bubble_type(*bub, fp);///bub CALLOC((*ref), 1); load_kv_u_trans_t(*ref, fp);///ta + load_rid(&R_INF, fp);///read id + fclose(fp); free(buf); } +void prt_rcut_res(const char* fn, int8_t *s, ma_ug_t *ug, asg_t *sg) +{ + uint32_t i, k, flag = AMBIGU; + char *buf = (char*)calloc(strlen(fn) + 50, 1); + sprintf(buf, "%s.rcut.res.log", fn); + FILE *fp = fopen(buf, "w"); + uint8_t *rs = NULL; MALLOC(rs, sg->n_seq); + memset(rs, AMBIGU, sg->n_seq*sizeof((*rs))); + + for (i = 0; i < ug->g->n_seq; i++) { + if(ug->g->seq[i].del) continue; + flag = AMBIGU; + if(s[i] == 0) continue; + flag = (s[i] > 0? FATHER:MOTHER); + for (k = 0; k < ug->u.a[i].n; k++) rs[ug->u.a[i].a[k]>>33] = flag; + } + + for (i = 0; i < sg->n_seq; i++) { + if(rs[i] == AMBIGU) continue; + fprintf(fp, "%.*s\t%u\n", (int)Get_NAME_LENGTH(R_INF, i), Get_NAME(R_INF, i), rs[i]); + } + + fclose(fp); free(rs); + free(buf); +} + void quick_debug_phasing(const char* fn) { kv_u_trans_t *ta; ma_ug_t *ug; asg_t *read_g; double f_rate; - uint32_t renew_s; int8_t *s; uint32_t is_sys, k; bubble_type *bub; kv_u_trans_t *ref; + uint32_t renew_s; int8_t *s; uint32_t is_sys; bubble_type *bub; kv_u_trans_t *ref; load_debug_phasing(fn, &ta, &ug, &read_g, &f_rate, &renew_s, &s, &is_sys, &bub, &ref); @@ -4602,10 +4662,10 @@ void quick_debug_phasing(const char* fn) mc_solve(NULL, NULL, ta, ug, read_g, f_rate, NULL, renew_s, s, is_sys, bub, ref, 0, 0); // mc_solve_core_adv(const mc_opt_t *opt, mc_g_t *mg, bubble_type* bub, kv_u_trans_t *ref) - for (k = 0; k < ug->g->n_seq; k++) { - fprintf(stderr, "utg%.6dl(len::%u), s[k]::%d\n", (int32_t)(k)+1, ug->g->seq[k].len, s[k]); - } - + // for (k = 0; k < ug->g->n_seq; k++) { + // fprintf(stderr, "utg%.6dl(len::%u), s[k]::%d\n", (int32_t)(k)+1, ug->g->seq[k].len, s[k]); + // } + prt_rcut_res(fn, s, ug, read_g); exit(1); } diff --git a/sketch.cpp b/sketch.cpp index ed49ac6..3f21111 100644 --- a/sketch.cpp +++ b/sketch.cpp @@ -461,7 +461,7 @@ void sf##_ha_sketch(const char *str, int len, int w, int k, uint32_t rid, int is uint32_t buf_p[256], min_s = (uint32_t)-1;\ tiny_queue_t tq;\ assert(len > 0 && (int64_t)(len) < (int64_t)((((uint64_t)1)< 0 && w < 256) && (k > 0 && k <= 63));\ - if (dbg_ct != NULL) dbg_ct->a.n = 0;\ + if (dbg_ct != NULL) dbg_ct->a.n = 0;\ if (k_flag != NULL) {\ kv_resize_km(km, uint8_t, k_flag->a, (uint64_t)len);\ k_flag->a.n = len;\