From 1238a537a7e86f0d2179070b862e5d2f17d0b4f4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 10 Jan 2024 00:02:11 -0500 Subject: [PATCH] r191: added -S to reject overlap between strands --- graph.c | 8 ++++---- main.c | 3 ++- overlap.c | 6 ++++-- pangene.h | 3 ++- pgpriv.h | 4 ++-- read.c | 6 +++--- 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/graph.c b/graph.c index 133dbf5..d59df2b 100644 --- a/graph.c +++ b/graph.c @@ -6,7 +6,7 @@ void pg_post_process(const pg_opt_t *opt, pg_data_t *d) { - int32_t i, j; + int32_t i, j, check_strand = !!(opt->flag&PG_F_CHECK_STRAND); if (pg_verbose >= 3) fprintf(stderr, "[M::%s::%s] %d genes and %d proteins\n", __func__, pg_timestamp(), d->n_gene, d->n_prot); pg_cap_score_dom(d); @@ -22,7 +22,7 @@ void pg_post_process(const pg_opt_t *opt, pg_data_t *d) int32_t n_shadow, tot; for (i = 0, tot = 0; i < g->n_hit; ++i) if (!g->hit[i].flt) ++tot; - n_shadow = pg_shadow(opt, d, j, 0); + n_shadow = pg_shadow(opt, d, j, 0, check_strand); fprintf(stderr, "[M::%s::%s] genome[%d]: %s; %d hits remain, of which %d are shadowed\n", __func__, pg_timestamp(), j, g->label, tot, n_shadow); } @@ -86,7 +86,7 @@ static inline int32_t pg_get_score(const pg_graph_t *q, const pg_hit_t *a, int32 void pg_gen_arc(const pg_opt_t *opt, pg_graph_t *q) { - int32_t j, i, i0, *seg_cnt = 0, use_ori = !!(opt->flag&PG_F_ORI_FOR_BRANCH); + int32_t j, i, i0, *seg_cnt = 0, use_ori = !!(opt->flag&PG_F_ORI_FOR_BRANCH), check_strand = !!(opt->flag&PG_F_CHECK_STRAND); int64_t n_arc = 0, m_arc = 0, n_arc1 = 0, m_arc1 = 0; pg_tmparc_t *p, *arc = 0, *arc1 = 0; @@ -99,7 +99,7 @@ void pg_gen_arc(const pg_opt_t *opt, pg_graph_t *q) uint32_t w, v = (uint32_t)-1; int64_t vpos = -1; int32_t vcid = -1, si = -1; - pg_shadow(opt, q->d, j, 0); + pg_shadow(opt, q->d, j, 0, check_strand); pg_hit_sort(g, 1); // sort by pg_hit_t::cm n_arc1 = 0; memset(seg_cnt, 0, q->n_seg * sizeof(int32_t)); diff --git a/main.c b/main.c index 60da6a8..71e8434 100644 --- a/main.c +++ b/main.c @@ -66,7 +66,7 @@ int main(int argc, char *argv[]) pg_data_t *d; pg_opt_init(&opt); - while ((c = ketopt(&o, argc, argv, 1, "d:e:l:f:g:p:b:B:y:Fr:c:a:wv:GD:C:T:X:I:P:m:JO", long_options)) >= 0) { + while ((c = ketopt(&o, argc, argv, 1, "d:e:l:f:g:p:b:B:y:Fr:c:a:wv:GD:C:T:X:I:P:m:JOS", long_options)) >= 0) { // input options if (c == 'd') opt.gene_delim = *o.arg; else if (c == 'X') opt.excl = pg_read_list_dict(o.arg); @@ -90,6 +90,7 @@ int main(int argc, char *argv[]) else if (c == 'F') opt.flag |= PG_F_FRAG_MODE; else if (c == 'D') opt.local_dist = pg_parse_num(o.arg); else if (c == 'C') opt.local_count = atoi(o.arg); + else if (c == 'S') opt.flag |= PG_F_CHECK_STRAND; // output options else if (c == 'w') opt.flag |= PG_F_WRITE_NO_WALK; else if (c == 'G') opt.flag |= PG_F_WRITE_VTX_SEL; diff --git a/overlap.c b/overlap.c index a69f991..c52510e 100644 --- a/overlap.c +++ b/overlap.c @@ -55,7 +55,7 @@ static inline int32_t pg_cds_len(const pg_hit_t *a, const pg_exon_t *e) */ // select among overlapping isoforms of the same gene -int32_t pg_flt_ov_isoform(const pg_opt_t *opt, pg_data_t *d, int32_t id) +int32_t pg_flt_ov_isoform(const pg_opt_t *opt, pg_data_t *d, int32_t id, int32_t check_strand) { const pg_prot_t *prot = d->prot; pg_genome_t *g = &d->genome[id]; @@ -75,6 +75,7 @@ int32_t pg_flt_ov_isoform(const pg_opt_t *opt, pg_data_t *d, int32_t id) if (aj->flt || aj->ce <= ai->cs) continue; // no overlap gj = prot[aj->pid].gid; if (gi != gj) continue; // ignore isoforms from different genes + if (check_strand && ai->rev != aj->rev) continue; hj = pg_hash_uint32(aj->pid); x = pg_hit_overlap(g, aj, ai); if (x>>32 == 0) continue; // no overlap on CDS @@ -97,7 +98,7 @@ typedef struct { } shadow_aux_t; // test overlap between same or different genes -int32_t pg_shadow(const pg_opt_t *opt, pg_data_t *d, int32_t id, int32_t cal_dom_sc) +int32_t pg_shadow(const pg_opt_t *opt, pg_data_t *d, int32_t id, int32_t cal_dom_sc, int32_t check_strand) { const pg_prot_t *prot = d->prot; pg_genome_t *g = &d->genome[id]; @@ -124,6 +125,7 @@ int32_t pg_shadow(const pg_opt_t *opt, pg_data_t *d, int32_t id, int32_t cal_dom pg_hit_t *aj = &g->hit[j]; if (aj->ce <= ai->cs) continue; // no overlap if (aj->flt) continue; + if (check_strand && ai->rev != aj->rev) continue; gj = prot[aj->pid].gid; hj = pg_hash_uint32(aj->pid); x = pg_hit_overlap(g, aj, ai); diff --git a/pangene.h b/pangene.h index 6e8e0b4..bf337c8 100644 --- a/pangene.h +++ b/pangene.h @@ -3,7 +3,7 @@ #include -#define PG_VERSION "1.0-r186-dirty" +#define PG_VERSION "1.0-r191-dirty" #define PG_F_WRITE_BED_RAW 0x1 #define PG_F_WRITE_BED_WALK 0x2 @@ -13,6 +13,7 @@ #define PG_F_FRAG_MODE 0x20 #define PG_F_NO_JOINT_PSEUDO 0x40 #define PG_F_ORI_FOR_BRANCH 0x80 +#define PG_F_CHECK_STRAND 0x100 typedef struct { uint64_t x, y; diff --git a/pgpriv.h b/pgpriv.h index 1c5346b..334acd1 100644 --- a/pgpriv.h +++ b/pgpriv.h @@ -80,8 +80,8 @@ void pg_gen_vtx(const pg_opt_t *opt, pg_graph_t *q); int32_t pg_mark_branch_flt_arc(const pg_opt_t *opt, pg_graph_t *q); int32_t pg_mark_branch_flt_hit(const pg_opt_t *opt, pg_graph_t *q); -int32_t pg_flt_ov_isoform(const pg_opt_t *opt, pg_data_t *d, int32_t id); -int32_t pg_shadow(const pg_opt_t *opt, pg_data_t *d, int32_t id, int32_t cal_dom_sc); +int32_t pg_flt_ov_isoform(const pg_opt_t *opt, pg_data_t *d, int32_t id, int32_t check_stand); +int32_t pg_shadow(const pg_opt_t *opt, pg_data_t *d, int32_t id, int32_t cal_dom_sc, int32_t check_strand); char *pg_strdup(const char *src); diff --git a/read.c b/read.c index 674b9d2..e7c6a9b 100644 --- a/read.c +++ b/read.c @@ -109,7 +109,7 @@ int32_t pg_read_paf(const pg_opt_t *opt, pg_data_t *d, const char *fn) gzFile fp; kstream_t *ks; kstring_t str = {0,0,0}; - int32_t dret, absent, n_tot = 0; + int32_t dret, absent, n_tot = 0, check_strand = !!(opt->flag&PG_F_CHECK_STRAND); void *d_ctg, *hit_rank; pg_genome_t *g; pg_exons_t buf = {0,0,0}; @@ -245,13 +245,13 @@ int32_t pg_read_paf(const pg_opt_t *opt, pg_data_t *d, const char *fn) n_pseudo = pg_flag_pseudo(d->prot, g); PG_SET_FILTER(d, pseudo == 1); pg_hit_sort(g, 0); - pg_shadow(opt, d, d->n_genome - 1, 1); + pg_shadow(opt, d, d->n_genome - 1, 1, check_strand); for (i = 0; i < g->n_hit; ++i) { pg_hit_t *a = &g->hit[i]; a->pid_dom0 = a->pid_dom; a->pid_dom = -1, a->shadow = 0; // reset } - n_flt_ov_iso = pg_flt_ov_isoform(opt, d, d->n_genome - 1); + n_flt_ov_iso = pg_flt_ov_isoform(opt, d, d->n_genome - 1, check_strand); n_flt_chain = pg_flt_chain_shadow(d->prot, d->n_prot, g); n_flt_subopt = pg_flt_subopt_isoform(d->prot, d->n_gene, g); if (pg_verbose >= 3)