Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes involving CRAM and header API for long references. #976

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 61 additions & 49 deletions cram/cram_decode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,25 @@ static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_
return -1;
}

static int add_cigar(cram_slice *s, uint64_t cig_len, int cig_op) {
int nlen = 1 + cig_len/(1<<28);
if (s->ncigar + nlen >= s->cigar_alloc) {
s->cigar_alloc = (s->ncigar + nlen) < 1024 ? 1024 : (s->ncigar + nlen)*2;
uint32_t *c2 = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
if (!c2)
return -1;
s->cigar = c2;
}

while (cig_len) {
int sub_len = cig_len < (1<<28) ? cig_len : (1<<28)-1;
s->cigar[s->ncigar++] = (sub_len<<4) + cig_op;
cig_len -= sub_len;
}

return 0;
}

/*
* Internal part of cram_decode_slice().
* Generates the sequence, quality and cigar components.
Expand All @@ -1097,13 +1116,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
int has_MD, int has_NM) {
int prev_pos = 0, f, r = 0, out_sz = 1;
int seq_pos = 1;
int cig_len = 0;
uint64_t cig_len = 0;
int64_t ref_pos = cr->apos;
int32_t fn, i32;
enum cigar_op cig_op = BAM_CMATCH;
uint32_t *cigar = s->cigar;
uint32_t ncigar = s->ncigar;
uint32_t cigar_alloc = s->cigar_alloc;
uint32_t nm = 0;
int32_t md_dist = 0;
int orig_aux = 0;
Expand Down Expand Up @@ -1134,7 +1150,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
}

ref_pos--; // count from 0
cr->cigar = ncigar;
cr->cigar = s->ncigar;

if (!(ds & (CRAM_FC | CRAM_FP)))
goto skip_cigar;
Expand All @@ -1143,13 +1159,6 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
int32_t pos = 0;
char op;

if (ncigar+2 >= cigar_alloc) {
cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
return -1;
s->cigar = cigar;
}

if (ds & CRAM_FC) {
if (!c->comp_hdr->codecs[DS_FC]) return -1;
r |= c->comp_hdr->codecs[DS_FC]->decode(s,
Expand Down Expand Up @@ -1231,13 +1240,15 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
}
#ifdef USE_X
if (cig_len && cig_op != BAM_CBASE_MATCH) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
cig_op = BAM_CBASE_MATCH;
#else
if (cig_len && cig_op != BAM_CMATCH) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
cig_op = BAM_CMATCH;
Expand All @@ -1258,7 +1269,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
int have_sc = 0;

if (cig_len) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
switch (CRAM_MAJOR_VERS(fd->version)) {
Expand Down Expand Up @@ -1297,7 +1309,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
}
if (have_sc) {
if (r) return r;
cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
if (add_cigar(s, out_sz2, BAM_CSOFT_CLIP) < 0)
goto err;
cig_op = BAM_CSOFT_CLIP;
seq_pos += out_sz2;
}
Expand All @@ -1308,7 +1321,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
unsigned char base;
#ifdef USE_X
if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
if (ds & CRAM_BS) {
Expand All @@ -1323,7 +1337,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
#else
int ref_base;
if (cig_len && cig_op != BAM_CMATCH) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
if (ds & CRAM_BS) {
Expand Down Expand Up @@ -1365,7 +1380,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,

case 'D': { // Deletion; DL
if (cig_len && cig_op != BAM_CDEL) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
if (ds & CRAM_DL) {
Expand Down Expand Up @@ -1419,7 +1435,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
int32_t out_sz2 = 1;

if (cig_len && cig_op != BAM_CINS) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}

Expand All @@ -1441,7 +1458,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,

case 'i': { // Insertion (single base); BA
if (cig_len && cig_op != BAM_CINS) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
if (ds & CRAM_BA) {
Expand All @@ -1463,7 +1481,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
int32_t len = 1;

if (cig_len && cig_op != BAM_CMATCH) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}

Expand Down Expand Up @@ -1513,7 +1532,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
int32_t len = 1;

if (cig_len && cig_op != BAM_CMATCH) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}

Expand All @@ -1537,12 +1557,14 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
case 'B': { // Read base; BA, QS
#ifdef USE_X
if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
#else
if (cig_len && cig_op != BAM_CMATCH) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
#endif
Expand Down Expand Up @@ -1601,7 +1623,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,

case 'H': { // hard clip; HC
if (cig_len && cig_op != BAM_CHARD_CLIP) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
if (ds & CRAM_HC) {
Expand All @@ -1618,7 +1641,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,

case 'P': { // padding; PD
if (cig_len && cig_op != BAM_CPAD) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
if (ds & CRAM_PD) {
Expand All @@ -1635,7 +1659,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,

case 'N': { // Ref skip; RS
if (cig_len && cig_op != BAM_CREF_SKIP) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
if (ds & CRAM_RS) {
Expand Down Expand Up @@ -1722,21 +1747,17 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
ref_pos += cr->len - seq_pos + 1;
}

if (ncigar+1 >= cigar_alloc) {
cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
return -1;
s->cigar = cigar;
}
#ifdef USE_X
if (cig_len && cig_op != BAM_CBASE_MATCH) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
cig_op = BAM_CBASE_MATCH;
#else
if (cig_len && cig_op != BAM_CMATCH) {
cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
cig_len = 0;
}
cig_op = BAM_CMATCH;
Expand All @@ -1752,17 +1773,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
}

if (cig_len) {
if (ncigar >= cigar_alloc) {
cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
return -1;
s->cigar = cigar;
}

cigar[ncigar++] = (cig_len<<4) + cig_op;
if (add_cigar(s, cig_len, cig_op) < 0)
goto err;
}

cr->ncigar = ncigar - cr->cigar;
cr->ncigar = s->ncigar - cr->cigar;
cr->aend = ref_pos;

//printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
Expand All @@ -1787,10 +1802,6 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
}
}

s->cigar = cigar;
s->cigar_alloc = cigar_alloc;
s->ncigar = ncigar;

if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
cr->len = 0;

Expand Down Expand Up @@ -1834,6 +1845,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
hts_log_error("CRAM CIGAR extends beyond slice reference extents");
return -1;

err:
block_err:
return -1;
}
Expand Down
4 changes: 4 additions & 0 deletions cram/cram_encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -2773,6 +2773,10 @@ static int process_one_read(cram_fd *fd, cram_container *c,

c->num_bases += cr->len;
cr->apos = bam_pos(b)+1;
if (cr->apos >= UINT32_MAX) {
hts_log_error("Sequence position out of valid range");
goto err;
}
if (c->pos_sorted) {
if (cr->apos < s->last_apos) {
c->pos_sorted = 0;
Expand Down
32 changes: 4 additions & 28 deletions header.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ typedef khash_t(rm) rmhash_t;
static int sam_hdr_link_pg(sam_hdr_t *bh);

static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap);
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...);


#define MAX_ERROR_QUOTE 320 // Prevent over-long error messages
Expand Down Expand Up @@ -182,14 +181,10 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
if (hrecs->ref[nref].ty == NULL) {
// Attach header line to existing stub entry.
hrecs->ref[nref].ty = h_type;
// Check lengths match; correct if not.
if (len != hrecs->ref[nref].len) {
char tmp[32];
snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
hrecs->ref[nref].len);
if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
return -1;
}
// The old hrecs length may be incorrect as it is initially
// populated from h->target_len which has a max 32-bit size.
// We always take our latest data as canonical.
hrecs->ref[nref].len = len;
if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
return -1;

Expand Down Expand Up @@ -2485,25 +2480,6 @@ int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) {
return 0;
}

/*
* Adds or updates tag key,value pairs in a header line.
* Eg for adding M5 tags to @SQ lines or updating sort order for the
* @HD line.
*
* Specify multiple key,value pairs ending in NULL.
*
* Returns 0 on success
* -1 on failure
*/
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
va_list args;
int res;
va_start(args, type);
res = sam_hrecs_vupdate(hrecs, type, args);
va_end(args);
return res;
}

/*
* Looks for a specific key in a single sam header line identified by *type.
* If prev is non-NULL it also fills this out with the previous tag, to
Expand Down
Loading