diff --git a/Makefile b/Makefile index 3a3bb23..72aca73 100644 --- a/Makefile +++ b/Makefile @@ -54,6 +54,7 @@ SOURCES = align\ motif_map\ multi_partition\ needle\ + normalize\ ordered_bcf_overlap_matcher\ ordered_region_overlap_matcher\ partition\ @@ -96,7 +97,6 @@ SOURCES = align\ tbx_ordered_reader\ ahmm\ xcmp\ - normalize\ SOURCESONLY = main.cpp diff --git a/annotate_indels.cpp b/annotate_indels.cpp index 7173735..aaa6622 100644 --- a/annotate_indels.cpp +++ b/annotate_indels.cpp @@ -185,10 +185,6 @@ class Igor : Program std::clog << " [m] mode " << mode << "\n"; print_boo_op(" [d] debug ", debug); print_ref_op(" [r] ref FASTA file ", ref_fasta_file); - - std::clog << " [m] mode dfsdasdad " << mode << "\n"; - - print_boo_op(" [x] override tag ", override_tag); print_boo_op(" [v] add vntr record ", add_vntr_record); @@ -205,18 +201,19 @@ class Igor : Program void insert_vntr_record(bcf_hdr_t* h, bcf1_t *v, Variant& variant) { -// bcf_update_info_string(h, v, MOTIF.c_str(), vntr.motif.c_str()); -// bcf_update_info_string(h, v, RU.c_str(), vntr.ru.c_str()); -// bcf_update_info_float(h, v, RL.c_str(), &vntr.rl, 1); -// bcf_update_info_string(h, v, REF.c_str(), vntr.repeat_tract.seq.c_str()); -// bcf_update_info_int32(h, v, REFPOS.c_str(), &vntr.repeat_tract.pos1, 1); -// kstring_t new_alleles = {0,0,0}; -// kputs(vntr.repeat_tract.seq.c_str(), &new_alleles); -// kputc(',', &new_alleles); -// kputs("", &new_alleles); -// bcf_update_alleles_str(h, v, new_alleles.s); -// bcf_set_pos1(v, vntr.repeat_tract.pos1); -// if (new_alleles.m) free(new_alleles.s); + VNTR& vntr = variant.vntr; + bcf_update_info_string(h, v, MOTIF.c_str(), vntr.motif.c_str()); + bcf_update_info_string(h, v, RU.c_str(), vntr.ru.c_str()); + bcf_update_info_float(h, v, RL.c_str(), &vntr.rl, 1); + bcf_update_info_string(h, v, REF.c_str(), vntr.repeat_tract.seq.c_str()); + bcf_update_info_int32(h, v, REFPOS.c_str(), &vntr.repeat_tract.pos1, 1); + kstring_t new_alleles = {0,0,0}; + kputs(vntr.repeat_tract.seq.c_str(), &new_alleles); + kputc(',', &new_alleles); + kputs("", &new_alleles); + bcf_update_alleles_str(h, v, new_alleles.s); + bcf_set_pos1(v, vntr.repeat_tract.pos1); + if (new_alleles.m) free(new_alleles.s); } void annotate_indels() @@ -240,6 +237,7 @@ class Igor : Program if (add_vntr_record) { + insert_vntr_record(odr->hdr, v, variant); odw->write(v); v = odw->get_bcf1_from_pool(); diff --git a/lib/htslib/cram/cram_decode.c b/lib/htslib/cram/cram_decode.c index 389dcfc..47a5130 100644 --- a/lib/htslib/cram/cram_decode.c +++ b/lib/htslib/cram/cram_decode.c @@ -726,7 +726,7 @@ int cram_dependent_data_series(cram_fd *fd, fd->decode_md = 0; if (fd->required_fields & SAM_QUAL) - hdr->data_series |= CRAM_SEQ; + hdr->data_series |= CRAM_QUAL; if (fd->required_fields & SAM_AUX) hdr->data_series |= CRAM_RG | CRAM_TL | CRAM_aux; diff --git a/lib/htslib/cram/cram_io.c b/lib/htslib/cram/cram_io.c index f35cf99..95101e0 100644 --- a/lib/htslib/cram/cram_io.c +++ b/lib/htslib/cram/cram_io.c @@ -3014,6 +3014,9 @@ int cram_write_container(cram_fd *fd, cram_container *c) { static int cram_flush_container2(cram_fd *fd, cram_container *c) { int i, j; + if (c->curr_slice > 0 && !c->slices) + return -1; + //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum); /* Write the container struct itself */ @@ -4255,7 +4258,7 @@ int cram_eof(cram_fd *fd) { * Returns 0 on success * -1 on failure */ -int cram_set_option(cram_fd *fd, enum cram_option opt, ...) { +int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) { int r; va_list args; @@ -4273,7 +4276,7 @@ int cram_set_option(cram_fd *fd, enum cram_option opt, ...) { * Returns 0 on success * -1 on failure */ -int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) { +int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { refs_t *refs; if (!fd) @@ -4406,6 +4409,10 @@ int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) { fd->required_fields = va_arg(args, int); break; + case HTS_OPT_COMPRESSION_LEVEL: + fd->level = va_arg(args, int); + break; + default: fprintf(stderr, "Unknown CRAM option code %d\n", opt); return -1; diff --git a/lib/htslib/cram/cram_io.h b/lib/htslib/cram/cram_io.h index f1ed5c9..9e08a6d 100644 --- a/lib/htslib/cram/cram_io.h +++ b/lib/htslib/cram/cram_io.h @@ -609,7 +609,7 @@ int cram_eof(cram_fd *fd); * Returns 0 on success; * -1 on failure */ -int cram_set_option(cram_fd *fd, enum cram_option opt, ...); +int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...); /*! Sets options on the cram_fd. * @@ -620,7 +620,7 @@ int cram_set_option(cram_fd *fd, enum cram_option opt, ...); * Returns 0 on success; * -1 on failure */ -int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args); +int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args); /*! * Attaches a header to a cram_fd. diff --git a/lib/htslib/cram/cram_structs.h b/lib/htslib/cram/cram_structs.h index 8a9548d..038000e 100644 --- a/lib/htslib/cram/cram_structs.h +++ b/lib/htslib/cram/cram_structs.h @@ -771,8 +771,10 @@ enum cram_fields { #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \ CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF) -#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_QS | CRAM_BS | \ - CRAM_RL | CRAM_AP | CRAM_BB | CRAM_QQ) +#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \ + CRAM_RL | CRAM_AP | CRAM_BB) + +#define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ) /* BF bitfields */ /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */ diff --git a/lib/htslib/hts.c b/lib/htslib/hts.c index 6341624..531d807 100644 --- a/lib/htslib/hts.c +++ b/lib/htslib/hts.c @@ -363,15 +363,63 @@ char *hts_format_description(const htsFormat *format) return ks_release(&str); } -htsFile *hts_open(const char *fn, const char *mode) +htsFile *hts_open_format(const char *fn, const char *mode, htsFormat *fmt) { + char smode[102], *cp, *cp2, *mode_c; htsFile *fp = NULL; - hFILE *hfile = hopen(fn, mode); + hFILE *hfile; + char fmt_code = '\0'; + + strncpy(smode, mode, 100); + smode[100]=0; + if ((cp = strchr(smode, ','))) + *cp = '\0'; + + // Migrate format code (b or c) to the end of the smode buffer. + for (cp2 = cp = smode; *cp; cp++) { + if (*cp == 'b') + fmt_code = 'b'; + else if (*cp == 'c') + fmt_code = 'c'; + else + *cp2++ = *cp; + } + mode_c = cp2; + *cp2++ = fmt_code; + *cp2++ = 0; + *cp2++ = 0; + + // Attempt to auto-detect based on filename + if (fmt && fmt->format == unknown_format) { + const char *ext = strrchr(fn, '.'); + if (ext) { + ext++; + if (strcmp(ext, "sam") == 0) + fmt->format = sam; + else if (strcmp(ext, "bam") == 0) + fmt->format = bam; + else if (strcmp(ext, "cram") == 0) + fmt->format = cram; + else if (strcmp(ext, "vcf") == 0) + fmt->format = vcf; + else if (strcmp(ext, "bcf") == 0) + fmt->format = bcf; + } + } + + // Set or reset the format code if opts->format is used + if (fmt && fmt->format != unknown_format) + *mode_c = "\0g\0\0b\0c\0\0b\0g\0\0"[fmt->format]; + + hfile = hopen(fn, smode); if (hfile == NULL) goto error; - fp = hts_hopen(hfile, fn, mode); + fp = hts_hopen(hfile, fn, smode); if (fp == NULL) goto error; + if (fmt && fmt->specific) + hts_opt_apply(fp, fmt->specific); + return fp; error: @@ -384,28 +432,297 @@ htsFile *hts_open(const char *fn, const char *mode) return NULL; } +htsFile *hts_open(const char *fn, const char *mode) { + return hts_open_format(fn, mode, NULL); +} + +/* + * Parses arg and appends it to the option list. + * + * Returns 0 on success; + * -1 on failure. + */ +int hts_opt_add(hts_opt **opts, const char *c_arg) { + hts_opt *o, *t; + char *val; + + if (!c_arg) + return -1; + + if (!(o = malloc(sizeof(*o)))) + return -1; + + if (!(o->arg = strdup(c_arg))) { + free(o); + return -1; + } + + if (!(val = strchr(o->arg, '='))) + val = "1"; // assume boolean + else + *val++ = '\0'; + + if (strcmp(o->arg, "decode_md") == 0 || + strcmp(o->arg, "DECODE_MD") == 0) + o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val); + + else if (strcmp(o->arg, "verbosity") == 0 || + strcmp(o->arg, "VERBOSITY") == 0) + o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val); + + else if (strcmp(o->arg, "seqs_per_slice") == 0 || + strcmp(o->arg, "SEQS_PER_SLICE") == 0) + o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val); + + else if (strcmp(o->arg, "slices_per_container") == 0 || + strcmp(o->arg, "SLICES_PER_CONTAINER") == 0) + o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val); + + else if (strcmp(o->arg, "embed_ref") == 0 || + strcmp(o->arg, "EMBED_REF") == 0) + o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val); + + else if (strcmp(o->arg, "no_ref") == 0 || + strcmp(o->arg, "NO_REF") == 0) + o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val); + + else if (strcmp(o->arg, "ignore_md5") == 0 || + strcmp(o->arg, "IGNORE_MD5") == 0) + o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val); + + else if (strcmp(o->arg, "use_bzip2") == 0 || + strcmp(o->arg, "USE_BZIP2") == 0) + o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val); + + else if (strcmp(o->arg, "use_rans") == 0 || + strcmp(o->arg, "USE_RANS") == 0) + o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val); + + else if (strcmp(o->arg, "use_lzma") == 0 || + strcmp(o->arg, "USE_LZMA") == 0) + o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val); + + else if (strcmp(o->arg, "reference") == 0 || + strcmp(o->arg, "REFERENCE") == 0) + o->opt = CRAM_OPT_REFERENCE, o->val.s = val; + + else if (strcmp(o->arg, "version") == 0 || + strcmp(o->arg, "VERSION") == 0) + o->opt = CRAM_OPT_VERSION, o->val.s =val; + + else if (strcmp(o->arg, "multi_seq_per_slice") == 0 || + strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0) + o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val); + + else if (strcmp(o->arg, "nthreads") == 0 || + strcmp(o->arg, "NTHREADS") == 0) + o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val); + + else if (strcmp(o->arg, "required_fields") == 0 || + strcmp(o->arg, "REQUIRED_FIELDS") == 0) + o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0); + + else { + fprintf(stderr, "Unknown option '%s'\n", o->arg); + free(o->arg); + free(o); + return -1; + } + + o->next = NULL; + + // Append; assumes small list. + if (*opts) { + t = *opts; + while (t->next) + t = t->next; + t->next = o; + } else { + *opts = o; + } + + return 0; +} + +/* + * Applies an hts_opt option list to a given htsFile. + * + * Returns 0 on success + * -1 on failure + */ +int hts_opt_apply(htsFile *fp, hts_opt *opts) { + hts_opt *last = NULL; + + for (; opts; opts = (last=opts)->next) + if (hts_set_opt(fp, opts->opt, opts->val) != 0) + return -1; + + return 0; +} + +/* + * Frees an hts_opt list. + */ +void hts_opt_free(hts_opt *opts) { + hts_opt *last = NULL; + while (opts) { + opts = (last=opts)->next; + free(last->arg); + free(last); + } +} + + +/* + * Tokenise options as (key(=value)?,)*(key(=value)?)? + * NB: No provision for ',' appearing in the value! + * Add backslashing rules? + * + * This could be used as part of a general command line option parser or + * as a string concatenated onto the file open mode. + * + * Returns 0 on success + * -1 on failure. + */ +int hts_parse_opt_list(htsFormat *fmt, const char *str) { + while (str && *str) { + const char *str_start; + int len; + char arg[8001]; + + while (*str && *str == ',') + str++; + + for (str_start = str; *str && *str != ','; str++); + len = str - str_start; + + // Produce a nul terminated copy of the option + strncpy(arg, str_start, len < 8000 ? len : 8000); + arg[len < 8000 ? len : 8000] = '\0'; + + if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0) + return -1; + + if (*str) + str++; + } + + return 0; +} + +/* + * Accepts a string file format (sam, bam, cram, vcf, bam) optionally + * followed by a comma separated list of key=value options and splits + * these up into the fields of htsFormat struct. + * + * format is assumed to be already initialised, either to blank + * "unknown" values or via previous hts_opt_add calls. + * + * Returns 0 on success + * -1 on failure. + */ +int hts_parse_format(htsFormat *format, const char *str) { + const char *cp; + + if (!(cp = strchr(str, ','))) + cp = str+strlen(str); + + format->version.minor = 0; // unknown + format->version.major = 0; // unknown + + if (strncmp(str, "sam", cp-str) == 0) { + format->category = sequence_data; + format->format = sam; + format->compression = no_compression;; + format->compression_level = 0; + } else if (strncmp(str, "bam", cp-str) == 0) { + format->category = sequence_data; + format->format = bam; + format->compression = bgzf; + format->compression_level = -1; + } else if (strncmp(str, "cram", cp-str) == 0) { + format->category = sequence_data; + format->format = cram; + format->compression = custom; + format->compression_level = -1; + } else if (strncmp(str, "vcf", cp-str) == 0) { + format->category = variant_data; + format->format = vcf; + format->compression = no_compression;; + format->compression_level = 0; + } else if (strncmp(str, "bcf", cp-str) == 0) { + format->category = variant_data; + format->format = bcf; + format->compression = bgzf; + format->compression_level = -1; + } else { + return -1; + } + + return hts_parse_opt_list(format, cp); +} + + +/* + * Tokenise options as (key(=value)?,)*(key(=value)?)? + * NB: No provision for ',' appearing in the value! + * Add backslashing rules? + * + * This could be used as part of a general command line option parser or + * as a string concatenated onto the file open mode. + * + * Returns 0 on success + * -1 on failure. + */ +static int hts_process_opts(htsFile *fp, const char *opts) { + htsFormat fmt; + + fmt.specific = NULL; + if (hts_parse_opt_list(&fmt, opts) != 0) + return -1; + + hts_opt_apply(fp, fmt.specific); + hts_opt_free(fmt.specific); + + return 0; +} + + htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode) { htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile)); + char simple_mode[101], *cp, *opts; + simple_mode[100] = '\0'; + if (fp == NULL) goto error; fp->fn = strdup(fn); fp->is_be = ed_is_big(); - if (strchr(mode, 'r')) { + // Split mode into simple_mode,opts strings + if ((cp = strchr(mode, ','))) { + strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100); + simple_mode[cp-mode] = '\0'; + opts = cp+1; + } else { + strncpy(simple_mode, mode, 100); + opts = NULL; + } + + if (strchr(simple_mode, 'r')) { if (hts_detect_format(hfile, &fp->format) < 0) goto error; } - else if (strchr(mode, 'w') || strchr(mode, 'a')) { + else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) { htsFormat *fmt = &fp->format; fp->is_write = 1; - if (strchr(mode, 'b')) fmt->format = binary_format; - else if (strchr(mode, 'c')) fmt->format = cram; + if (strchr(simple_mode, 'b')) fmt->format = binary_format; + else if (strchr(simple_mode, 'c')) fmt->format = cram; else fmt->format = text_format; - if (strchr(mode, 'z')) fmt->compression = bgzf; - else if (strchr(mode, 'g')) fmt->compression = gzip; - else if (strchr(mode, 'u')) fmt->compression = no_compression; + if (strchr(simple_mode, 'z')) fmt->compression = bgzf; + else if (strchr(simple_mode, 'g')) fmt->compression = gzip; + else if (strchr(simple_mode, 'u')) fmt->compression = no_compression; else { // No compression mode specified, set to the default for the format switch (fmt->format) { @@ -429,13 +746,13 @@ htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode) case binary_format: case bam: case bcf: - fp->fp.bgzf = bgzf_hopen(hfile, mode); + fp->fp.bgzf = bgzf_hopen(hfile, simple_mode); if (fp->fp.bgzf == NULL) goto error; fp->is_bin = 1; break; case cram: - fp->fp.cram = cram_dopen(hfile, fn, mode); + fp->fp.cram = cram_dopen(hfile, fn, simple_mode); if (fp->fp.cram == NULL) goto error; if (!fp->is_write) cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1); @@ -447,7 +764,7 @@ htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode) case vcf: if (!fp->is_write) { #if KS_BGZF - BGZF *gzfp = bgzf_hopen(hfile, mode); + BGZF *gzfp = bgzf_hopen(hfile, simple_mode); #else // TODO Implement gzip hFILE adaptor hclose(hfile); // This won't work, especially for stdin @@ -457,7 +774,7 @@ htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode) else goto error; } else if (fp->format.compression != no_compression) { - fp->fp.bgzf = bgzf_hopen(hfile, mode); + fp->fp.bgzf = bgzf_hopen(hfile, simple_mode); if (fp->fp.bgzf == NULL) goto error; } else @@ -468,6 +785,9 @@ htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode) goto error; } + if (opts) + hts_process_opts(fp, opts); + return fp; error: @@ -545,10 +865,37 @@ const htsFormat *hts_get_format(htsFile *fp) return fp? &fp->format : NULL; } -int hts_set_opt(htsFile *fp, enum cram_option opt, ...) { +const char *hts_format_file_extension(const htsFormat *format) { + if (!format) + return "?"; + + switch (format->format) { + case sam: return "sam"; + case bam: return "bam"; + case bai: return "bai"; + case cram: return "cram"; + case crai: return "crai"; + case vcf: return "vcf"; + case bcf: return "bcf"; + case csi: return "csi"; + case gzi: return "gzi"; + case tbi: return "tbi"; + case bed: return "bed"; + default: return "?"; + } +} + +int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { int r; va_list args; + if (opt == HTS_OPT_NTHREADS) { + va_start(args, opt); + int nthreads = va_arg(args, int); + va_end(args); + return hts_set_threads(fp, nthreads); + } + if (fp->format.format != cram) return 0; diff --git a/lib/htslib/htsfile.c b/lib/htslib/htsfile.c index 8c50cb1..4ce7a6c 100644 --- a/lib/htslib/htsfile.c +++ b/lib/htslib/htsfile.c @@ -37,52 +37,108 @@ DEALINGS IN THE SOFTWARE. */ enum { identify, view_headers, view_all } mode = identify; int show_headers = 1; +int status = EXIT_SUCCESS; /* Exit status from main */ static htsFile *dup_stdout(const char *mode) { int fd = dup(STDOUT_FILENO); - hFILE *hfp = (fd >= 0)? hdopen(fd, mode) : NULL; + if (fd < 0) { + perror("htsfile: Couldn't duplicate stdout"); + return NULL; + } + hFILE *hfp = hdopen(fd, mode); return hfp? hts_hopen(hfp, "-", mode) : NULL; } static int view_sam(hFILE *hfp, const char *filename) { samFile *in = hts_hopen(hfp, filename, "r"); - if (in == NULL) return 0; - samFile *out = dup_stdout("w"); - bam_hdr_t *hdr = sam_hdr_read(in); + bam_hdr_t *hdr = NULL; + samFile *out = NULL; + if (in == NULL) { + status = EXIT_FAILURE; + return 0; + } + hdr = sam_hdr_read(in); + if (hdr == NULL) { + status = EXIT_FAILURE; + goto clean; + } + out = dup_stdout("w"); + if (out == NULL) { + status = EXIT_FAILURE; + goto clean; + } - if (show_headers) sam_hdr_write(out, hdr); + if (show_headers) { + if (sam_hdr_write(out, hdr) != 0) { + status = EXIT_FAILURE; + goto clean; + } + } if (mode == view_all) { bam1_t *b = bam_init1(); - while (sam_read1(in, hdr, b) >= 0) - sam_write1(out, hdr, b); + int ret; + while ((ret = sam_read1(in, hdr, b)) >= 0) { + if (sam_write1(out, hdr, b) < 0) { + status = EXIT_FAILURE; + goto clean; + } + } bam_destroy1(b); + if (ret != -1) // eof + status = EXIT_FAILURE; } - bam_hdr_destroy(hdr); - hts_close(out); - hts_close(in); + clean: + if (hdr != NULL) bam_hdr_destroy(hdr); + if (out != NULL && hts_close(out) != 0) + status = EXIT_FAILURE; + if (hts_close(in) != 0) + status = EXIT_FAILURE; return 1; } static int view_vcf(hFILE *hfp, const char *filename) { vcfFile *in = hts_hopen(hfp, filename, "r"); - if (in == NULL) return 0; - vcfFile *out = dup_stdout("w"); - bcf_hdr_t *hdr = bcf_hdr_read(in); + bcf_hdr_t *hdr = NULL; + vcfFile *out = NULL; + if (in == NULL) { + status = EXIT_FAILURE; + return 0; + } + hdr = bcf_hdr_read(in); + if (hdr == NULL) { + status = EXIT_FAILURE; + goto clean; + } + out = dup_stdout("w"); + if (out == NULL) { + status = EXIT_FAILURE; + goto clean; + } - if (show_headers) bcf_hdr_write(out, hdr); + if (show_headers) { + if (bcf_hdr_write(out, hdr) != 0) { + status = EXIT_FAILURE; + goto clean; + } + } if (mode == view_all) { bcf1_t *rec = bcf_init(); - while (bcf_read(in, hdr, rec) >= 0) - bcf_write(out, hdr, rec); + while (bcf_read(in, hdr, rec) >= 0) { + if (bcf_write(out, hdr, rec) < 0) { + status = EXIT_FAILURE; + goto clean; + } + } bcf_destroy(rec); } - bcf_hdr_destroy(hdr); - hts_close(out); + clean: + if (hdr != NULL) bcf_hdr_destroy(hdr); + if (out != NULL) hts_close(out); hts_close(in); return 1; } @@ -109,8 +165,9 @@ int main(int argc, char **argv) { NULL, 0, NULL, 0 } }; - int status = EXIT_SUCCESS; int c, i; + + status = EXIT_SUCCESS; while ((c = getopt_long(argc, argv, "chH?", options, NULL)) >= 0) switch (c) { case 'c': mode = view_all; break; @@ -129,6 +186,8 @@ int main(int argc, char **argv) if (optind == argc) usage(stderr, EXIT_FAILURE); + if (mode != identify) hts_verbose = 2; /* So we get some error messages */ + for (i = optind; i < argc; i++) { htsFormat fmt; hFILE *fp = hopen(argv[i], "r"); @@ -152,8 +211,12 @@ int main(int argc, char **argv) } else switch (fmt.category) { - case sequence_data: if (view_sam(fp, argv[i])) fp = NULL; break; - case variant_data: if (view_vcf(fp, argv[i])) fp = NULL; break; + case sequence_data: + if (view_sam(fp, argv[i])) fp = NULL; + break; + case variant_data: + if (view_vcf(fp, argv[i])) fp = NULL; + break; default: fprintf(stderr, "htsfile: can't view %s: unknown format\n", argv[i]); status = EXIT_FAILURE; diff --git a/lib/htslib/htslib/hts.h b/lib/htslib/htslib/hts.h index 46bdca1..69e9688 100644 --- a/lib/htslib/htslib/hts.h +++ b/lib/htslib/htslib/hts.h @@ -103,7 +103,7 @@ typedef struct htsFormat { struct { short major, minor; } version; enum htsCompression compression; short compression_level; // currently unused - void *specific; // currently unused + void *specific; // format specific options; see struct hts_opt. } htsFormat; // Maintainers note htsFile cannot be an opaque structure because some of its @@ -144,32 +144,97 @@ enum sam_fields { SAM_RGAUX = 0x00001000, }; -enum cram_option { +// Mostly CRAM only, but this could also include other format options +enum hts_fmt_option { + // CRAM specific CRAM_OPT_DECODE_MD, CRAM_OPT_PREFIX, - CRAM_OPT_VERBOSITY, + CRAM_OPT_VERBOSITY, // make general CRAM_OPT_SEQS_PER_SLICE, CRAM_OPT_SLICES_PER_CONTAINER, CRAM_OPT_RANGE, - CRAM_OPT_VERSION, + CRAM_OPT_VERSION, // rename to cram_version? CRAM_OPT_EMBED_REF, CRAM_OPT_IGNORE_MD5, - CRAM_OPT_REFERENCE, + CRAM_OPT_REFERENCE, // make general CRAM_OPT_MULTI_SEQ_PER_SLICE, CRAM_OPT_NO_REF, CRAM_OPT_USE_BZIP2, CRAM_OPT_SHARED_REF, - CRAM_OPT_NTHREADS, - CRAM_OPT_THREAD_POOL, + CRAM_OPT_NTHREADS, // deprecated, use HTS_OPT_NTHREADS + CRAM_OPT_THREAD_POOL,// make general CRAM_OPT_USE_LZMA, CRAM_OPT_USE_RANS, CRAM_OPT_REQUIRED_FIELDS, + + // General purpose + HTS_OPT_COMPRESSION_LEVEL = 100, + HTS_OPT_NTHREADS, }; +// For backwards compatibility +#define cram_option hts_fmt_option + +typedef struct hts_opt { + char *arg; // string form, strdup()ed + enum hts_fmt_option opt; // tokenised key + union { // ... and value + int i; + char *s; + } val; + struct hts_opt *next; +} hts_opt; + +#define HTS_FILE_OPTS_INIT {{0},0} + /********************** * Exported functions * **********************/ +/* + * Parses arg and appends it to the option list. + * + * Returns 0 on success; + * -1 on failure. + */ +int hts_opt_add(hts_opt **opts, const char *c_arg); + +/* + * Applies an hts_opt option list to a given htsFile. + * + * Returns 0 on success + * -1 on failure + */ +int hts_opt_apply(htsFile *fp, hts_opt *opts); + +/* + * Frees an hts_opt list. + */ +void hts_opt_free(hts_opt *opts); + +/* + * Accepts a string file format (sam, bam, cram, vcf, bam) optionally + * followed by a comma separated list of key=value options and splits + * these up into the fields of htsFormat struct. + * + * Returns 0 on success + * -1 on failure. + */ +int hts_parse_format(htsFormat *opt, const char *str); + +/* + * Tokenise options as (key(=value)?,)*(key(=value)?)? + * NB: No provision for ',' appearing in the value! + * Add backslashing rules? + * + * This could be used as part of a general command line option parser or + * as a string concatenated onto the file open mode. + * + * Returns 0 on success + * -1 on failure. + */ +int hts_parse_opt_list(htsFormat *opt, const char *str); + extern int hts_verbose; /*! @abstract Table for converting a nucleotide character to 4-bit encoding. @@ -206,6 +271,7 @@ int hts_detect_format(struct hFILE *fp, htsFormat *fmt); /*! @abstract Get a human-readable description of the file format + @param fmt Format structure holding type, version, compression, etc. @return Description string, to be freed by the caller after use. */ char *hts_format_description(const htsFormat *format); @@ -236,6 +302,21 @@ char *hts_format_description(const htsFormat *format); */ htsFile *hts_open(const char *fn, const char *mode); +/*! + @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file + @param fn The file name or "-" for stdin/stdout + @param mode Mode matching /[rwa][bcuz0-9]+/ + @param fmt Optional format specific parameters + @discussion + See hts_open() for description of fn and mode. + Opts contains a format string (sam, bam, cram, vcf, bcf) which will, + if defined, override mode. Opts also contains a linked list of hts_opt + structures to apply to the open file handle. These can contain things + like pointers to the reference or information on compression levels, + block sizes, etc. +*/ +htsFile *hts_open_format(const char *fn, const char *mode, htsFormat *fmt); + /*! @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file @param fn The already-open file handle @@ -257,6 +338,13 @@ int hts_close(htsFile *fp); */ const htsFormat *hts_get_format(htsFile *fp); +/*! + @ abstract Returns a string containing the file format extension. + @ param format Format structure containing the file type. + @ return A string ("sam", "bam", etc) or "?" for unknown formats. + */ +const char *hts_format_file_extension(const htsFormat *format); + /*! @abstract Sets a specified CRAM option on the open file handle. @param fp The file handle open the open file. @@ -264,7 +352,7 @@ const htsFormat *hts_get_format(htsFile *fp); @param ... Optional arguments, dependent on the option used. @return 0 for success, or negative if an error occurred. */ -int hts_set_opt(htsFile *fp, enum cram_option opt, ...); +int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...); int hts_getline(htsFile *fp, int delimiter, kstring_t *str); char **hts_readlines(const char *fn, int *_n); diff --git a/lib/htslib/htslib/kstring.h b/lib/htslib/htslib/kstring.h index df2da3f..fd91bbe 100644 --- a/lib/htslib/htslib/kstring.h +++ b/lib/htslib/htslib/kstring.h @@ -82,6 +82,13 @@ extern "C" { * if sep is not changed. */ char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); + /* kgetline() uses the supplied fgets()-like function to read a "\n"- + * or "\r\n"-terminated line from fp. The line read is appended to the + * kstring without its terminator and 0 is returned; EOF is returned at + * EOF or on error (determined by querying fp, as per fgets()). */ + typedef char *kgets_func(char *, int, void *); + int kgetline(kstring_t *s, kgets_func *fgets, void *fp); + #ifdef __cplusplus } #endif diff --git a/lib/htslib/htslib/sam.h b/lib/htslib/htslib/sam.h index 613d52d..4ded5eb 100644 --- a/lib/htslib/htslib/sam.h +++ b/lib/htslib/htslib/sam.h @@ -342,10 +342,18 @@ int sam_index_build2(const char *fn, const char *fnidx, int min_shift); ***************/ #define sam_open(fn, mode) (hts_open((fn), (mode))) + #define sam_open_format(fn, mode, opts) (hts_open_format((fn), (mode), (opts))) #define sam_close(fp) hts_close(fp) int sam_open_mode(char *mode, const char *fn, const char *format); + // A version of sam_open_mode that can handle ,key=value options. + // The format string is allocated and returned, to be freed by the caller. + // Prefix should be "r" or "w", + char *sam_open_mode_opts(const char *fn, + const char *mode, + const char *format); + typedef htsFile samFile; bam_hdr_t *sam_hdr_parse(int l_text, const char *text); bam_hdr_t *sam_hdr_read(samFile *fp); diff --git a/lib/htslib/kstring.c b/lib/htslib/kstring.c index 0128266..b84ee2c 100644 --- a/lib/htslib/kstring.c +++ b/lib/htslib/kstring.c @@ -130,6 +130,26 @@ int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) return n; } +int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp) +{ + size_t l0 = s->l; + + while (s->l == l0 || s->s[s->l-1] != '\n') { + if (s->m - s->l < 200) ks_resize(s, s->m + 200); + if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break; + s->l += strlen(s->s + s->l); + } + + if (s->l == l0) return EOF; + + if (s->l > l0 && s->s[s->l-1] == '\n') { + s->l--; + if (s->l > l0 && s->s[s->l-1] == '\r') s->l--; + } + s->s[s->l] = '\0'; + return 0; +} + /********************** * Boyer-Moore search * **********************/ diff --git a/lib/htslib/sam.c b/lib/htslib/sam.c index 9265c12..04fb758 100644 --- a/lib/htslib/sam.c +++ b/lib/htslib/sam.c @@ -113,7 +113,9 @@ bam_hdr_t *bam_hdr_read(BGZF *fp) bam_hdr_t *h; char buf[4]; int magic_len, has_EOF; - int32_t i = 1, name_len; + int32_t i, name_len, num_names = 0; + size_t bufsize; + ssize_t bytes; // check EOF has_EOF = bgzf_check_EOF(fp); if (has_EOF < 0) { @@ -127,26 +129,88 @@ bam_hdr_t *bam_hdr_read(BGZF *fp) return 0; } h = bam_hdr_init(); + if (!h) goto nomem; + // read plain text and the number of reference sequences - bgzf_read(fp, &h->l_text, 4); + bytes = bgzf_read(fp, &h->l_text, 4); + if (bytes != 4) goto read_err; if (fp->is_be) ed_swap_4p(&h->l_text); - h->text = (char*)malloc(h->l_text + 1); + + bufsize = ((size_t) h->l_text) + 1; + if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed + h->text = (char*)malloc(bufsize); + if (!h->text) goto nomem; h->text[h->l_text] = 0; // make sure it is NULL terminated - bgzf_read(fp, h->text, h->l_text); - bgzf_read(fp, &h->n_targets, 4); + bytes = bgzf_read(fp, h->text, h->l_text); + if (bytes != h->l_text) goto read_err; + + bytes = bgzf_read(fp, &h->n_targets, 4); + if (bytes != 4) goto read_err; if (fp->is_be) ed_swap_4p(&h->n_targets); + + if (h->n_targets < 0) goto invalid; + // read reference sequence names and lengths h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); + if (!h->target_name) goto nomem; h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t)); + if (!h->target_len) goto nomem; + for (i = 0; i != h->n_targets; ++i) { - bgzf_read(fp, &name_len, 4); + bytes = bgzf_read(fp, &name_len, 4); + if (bytes != 4) goto read_err; if (fp->is_be) ed_swap_4p(&name_len); - h->target_name[i] = (char*)calloc(name_len, 1); - bgzf_read(fp, h->target_name[i], name_len); - bgzf_read(fp, &h->target_len[i], 4); + if (name_len <= 0) goto invalid; + + h->target_name[i] = (char*)malloc(name_len); + if (!h->target_name[i]) goto nomem; + num_names++; + + bytes = bgzf_read(fp, h->target_name[i], name_len); + if (bytes != name_len) goto read_err; + + if (h->target_name[i][name_len - 1] != '\0') { + /* Fix missing NUL-termination. Is this being too nice? + We could alternatively bail out with an error. */ + char *new_name; + if (name_len == INT32_MAX) goto invalid; + new_name = realloc(h->target_name[i], name_len + 1); + if (new_name == NULL) goto nomem; + h->target_name[i] = new_name; + h->target_name[i][name_len] = '\0'; + } + + bytes = bgzf_read(fp, &h->target_len[i], 4); + if (bytes != 4) goto read_err; if (fp->is_be) ed_swap_4p(&h->target_len[i]); } return h; + + nomem: + if (hts_verbose >= 1) fprintf(stderr, "[E::%s] out of memory\n", __func__); + goto clean; + + read_err: + if (hts_verbose >= 1) { + if (bytes < 0) { + fprintf(stderr, "[E::%s] error reading BGZF stream\n", __func__); + } else { + fprintf(stderr, "[E::%s] truncated bam header\n", __func__); + } + } + goto clean; + + invalid: + if (hts_verbose >= 1) { + fprintf(stderr, "[E::%s] invalid BAM binary header\n", __func__); + } + + clean: + if (h != NULL) { + h->n_targets = num_names; // ensure we free only allocated target_names + bam_hdr_destroy(h); + } + return NULL; } int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) @@ -398,6 +462,7 @@ static hts_idx_t *bam_index(BGZF *fp, int min_shift) hts_idx_t *idx; bam_hdr_t *h; h = bam_hdr_read(fp); + if (h == NULL) return NULL; if (min_shift > 0) { int64_t max_len = 0, s; for (i = 0; i < h->n_targets; ++i) @@ -1240,6 +1305,67 @@ int sam_open_mode(char *mode, const char *fn, const char *format) return 0; } +// A version of sam_open_mode that can handle ,key=value options. +// The format string is allocated and returned, to be freed by the caller. +// Prefix should be "r" or "w", +char *sam_open_mode_opts(const char *fn, + const char *mode, + const char *format) +{ + char *mode_opts = malloc((format ? strlen(format) : 1) + + (mode ? strlen(mode) : 1) + 12); + char *opts, *cp; + int format_len; + + if (!mode_opts) + return NULL; + + strcpy(mode_opts, mode ? mode : "r"); + cp = mode_opts + strlen(mode_opts); + + if (format == NULL) { + // Try to pick a format based on the filename extension + const char *ext = fn? strrchr(fn, '.') : NULL; + if (ext == NULL || strchr(ext, '/')) { + free(mode_opts); + return NULL; + } + return sam_open_mode(cp, fn, ext+1) + ? (free(mode_opts), NULL) + : mode_opts; + } + + if ((opts = strchr(format, ','))) { + format_len = opts-format; + } else { + opts=""; + format_len = strlen(format); + } + + if (strncmp(format, "bam", format_len) == 0) { + *cp++ = 'b'; + } else if (strncmp(format, "cram", format_len) == 0) { + *cp++ = 'c'; + } else if (strncmp(format, "cram2", format_len) == 0) { + *cp++ = 'c'; + strcpy(cp, ",VERSION=2.1"); + cp += 12; + } else if (strncmp(format, "cram3", format_len) == 0) { + *cp++ = 'c'; + strcpy(cp, ",VERSION=3.0"); + cp += 12; + } else if (strncmp(format, "sam", format_len) == 0) { + ; // format mode="" + } else { + free(mode_opts); + return NULL; + } + + strcpy(cp, opts); + + return mode_opts; +} + #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n)) int bam_str2flag(const char *str) { diff --git a/lib/htslib/test/test_view.c b/lib/htslib/test/test_view.c index c0f8469..30e9e7e 100644 --- a/lib/htslib/test/test_view.c +++ b/lib/htslib/test/test_view.c @@ -32,82 +32,6 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/sam.h" -typedef struct hts_opt { - enum cram_option opt; - union { - int i; - char *s; - } val; - struct hts_opt *next; -} hts_opt; - -/* - * Parses arg and appends it to the option list. - * Returns 0 on success; - * -1 on failure. - */ -int add_option(hts_opt **opts, char *arg) { - hts_opt *o, *t; - char *cp; - - if (!(cp = strchr(arg, '='))) - cp = "1"; // assume boolean - else - *cp++ = 0; - - if (!(o = malloc(sizeof(*o)))) - return -1; - - if (strcmp(arg, "DECODE_MD") == 0) - o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(cp); - else if (strcmp(arg, "VERBOSITY") == 0) - o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(cp); - else if (strcmp(arg, "SEQS_PER_SLICE") == 0) - o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(cp); - else if (strcmp(arg, "SLICES_PER_CONTAINER") == 0) - o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(cp); - else if (strcmp(arg, "EMBED_REF") == 0) - o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(cp); - else if (strcmp(arg, "NO_REF") == 0) - o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(cp); - else if (strcmp(arg, "IGNORE_MD5") == 0) - o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(cp); - else if (strcmp(arg, "USE_BZIP2") == 0) - o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(cp); - else if (strcmp(arg, "USE_RANS") == 0) - o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(cp); - else if (strcmp(arg, "USE_LZMA") == 0) - o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(cp); - else if (strcmp(arg, "REFERENCE") == 0) - o->opt = CRAM_OPT_REFERENCE, o->val.s = cp; - else if (strcmp(arg, "VERSION") == 0) - o->opt = CRAM_OPT_VERSION, o->val.s =cp; - else if (strcmp(arg, "MULTI_SEQ_PER_SLICE") == 0) - o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(cp); - else if (strcmp(arg, "NTHREADS") == 0) - o->opt = CRAM_OPT_NTHREADS, o->val.i = atoi(cp); - else if (strcmp(arg, "REQUIRED_FIELDS") == 0) - o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(cp, NULL, 0); - else { - fprintf(stderr, "Unknown option '%s'\n", arg); - free(o); - return -1; - } - - o->next = NULL; - - if (*opts) { - t = *opts; - while (t->next) - t = t->next; - t->next = o; - } else { - *opts = o; - } - - return 0; -} - int main(int argc, char *argv[]) { samFile *in; @@ -117,9 +41,9 @@ int main(int argc, char *argv[]) bam_hdr_t *h; bam1_t *b; htsFile *out; - char modew[8]; + char modew[800]; int r = 0, exit_code = 0; - hts_opt *in_opts = NULL, *out_opts = NULL, *last = NULL; + hts_opt *in_opts = NULL, *out_opts = NULL; int nreads = 0; int benchmark = 0; @@ -133,9 +57,9 @@ int main(int argc, char *argv[]) case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; - case 'i': if (add_option(&in_opts, optarg)) return 1; break; - case 'o': if (add_option(&out_opts, optarg)) return 1; break; - case 'N': nreads = atoi(optarg); + case 'i': if (hts_opt_add(&in_opts, optarg)) return 1; break; + case 'o': if (hts_opt_add(&out_opts, optarg)) return 1; break; + case 'N': nreads = atoi(optarg); break; } } if (argc == optind) { @@ -152,6 +76,10 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } h = sam_hdr_read(in); + if (h == NULL) { + fprintf(stderr, "Couldn't read header for \"%s\"\n", argv[optind]); + return EXIT_FAILURE; + } h->ignore_sam_err = ignore_sam_err; b = bam_init1(); @@ -184,15 +112,13 @@ int main(int argc, char *argv[]) } // Process any options; currently cram only. - for (; in_opts; in_opts = (last=in_opts)->next, free(last)) { - hts_set_opt(in, in_opts->opt, in_opts->val); - if (in_opts->opt == CRAM_OPT_REFERENCE) - if (hts_set_opt(out, in_opts->opt, in_opts->val) != 0) - return EXIT_FAILURE; - } - for (; out_opts; out_opts = (last=out_opts)->next, free(last)) - if (hts_set_opt(out, out_opts->opt, out_opts->val) != 0) - return EXIT_FAILURE; + if (hts_opt_apply(in, in_opts)) + return EXIT_FAILURE; + hts_opt_free(in_opts); + + if (hts_opt_apply(out, out_opts)) + return EXIT_FAILURE; + hts_opt_free(out_opts); if (!benchmark) sam_hdr_write(out, h); diff --git a/lib/htslib/vcf.c b/lib/htslib/vcf.c index 047c7b5..ae7e34f 100644 --- a/lib/htslib/vcf.c +++ b/lib/htslib/vcf.c @@ -2631,7 +2631,7 @@ static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *va } const char *r = ref, *a = alt; - while (*r && *a && *r==*a ) { r++; a++; } + while (*r && *a && toupper(*r)==toupper(*a) ) { r++; a++; } // unfortunately, matching REF,ALT case is not guaranteed if ( *a && !*r ) { @@ -2651,18 +2651,18 @@ static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *va const char *re = r, *ae = a; while ( re[1] ) re++; while ( ae[1] ) ae++; - while ( *re==*ae && re>r && ae>a ) { re--; ae--; } + while ( re>r && ae>a && toupper(*re)==toupper(*ae) ) { re--; ae--; } if ( ae==a ) { if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; } var->n = -(re-r); - if ( *re==*ae ) { var->type = VCF_INDEL; return; } + if ( toupper(*re)==toupper(*ae) ) { var->type = VCF_INDEL; return; } var->type = VCF_OTHER; return; } else if ( re==r ) { var->n = ae-a; - if ( *re==*ae ) { var->type = VCF_INDEL; return; } + if ( toupper(*re)==toupper(*ae) ) { var->type = VCF_INDEL; return; } var->type = VCF_OTHER; return; } diff --git a/test/decompose_blocksub/01_OUT_even_length.stderr b/test/decompose_blocksub/01_OUT_even_length.stderr index 772925e..728e1a1 100644 --- a/test/decompose_blocksub/01_OUT_even_length.stderr +++ b/test/decompose_blocksub/01_OUT_even_length.stderr @@ -2,6 +2,7 @@ decompose_blocksub v0.5 options: input VCF file [o] output VCF file + [a] align/aggressive mode false stats: no. variants : 1