Release 1.7: improved filtering; many other upgrades

samtools · Feb 12, 2018 · 33b14f1 · 33b14f1
2 parents 7b813be + 69c3317
commit 33b14f1
Show file tree

Hide file tree

Showing 148 changed files with 3,546 additions and 1,068 deletions.
diff --git a/Makefile b/Makefile
@@ -93,7 +93,7 @@ endif
 
 include config.mk
 
-PACKAGE_VERSION = 1.6
+PACKAGE_VERSION = 1.7
 
 # If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
 # description of the working tree: either a release tag with the same value
@@ -148,7 +148,7 @@ PLATFORM := $(shell uname -s)
 endif
 ifeq "$(PLATFORM)" "Darwin"
 $(PLUGINS): | bcftools
-PLUGIN_FLAGS = -bundle -bundle_loader bcftools
+PLUGIN_FLAGS = -bundle -bundle_loader bcftools -Wl,-undefined,dynamic_lookup
 else
 PLUGIN_FLAGS = -fPIC -shared
 endif
@@ -272,23 +272,23 @@ docs: doc/bcftools.1 doc/bcftools.html
 # bcftools.1 is a generated file from the asciidoc bcftools.txt file.
 # Since there is no make dependency, bcftools.1 can be out-of-date and
 # make docs can be run to update if asciidoc is available
-install: $(PROG) $(PLUGINS)
+install: $(PROGRAMS) $(PLUGINS)
 	$(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(man1dir) $(DESTDIR)$(plugindir)
 	$(INSTALL_PROGRAM) $(PROGRAMS) $(DESTDIR)$(bindir)
 	$(INSTALL_SCRIPT) $(MISC_SCRIPTS) $(DESTDIR)$(misc_bindir)
 	$(INSTALL_MAN) doc/bcftools.1 $(DESTDIR)$(man1dir)
 	$(INSTALL_PROGRAM) plugins/*.so $(DESTDIR)$(plugindir)
 
 clean: testclean clean-plugins
-	-rm -f gmon.out *.o *~ $(PROG) version.h plugins/*.so plugins/*.P
+	-rm -f gmon.out *.o *~ $(PROGRAMS) version.h plugins/*.so plugins/*.P
 	-rm -rf *.dSYM plugins/*.dSYM test/*.dSYM
 
 clean-plugins:
 	-rm -f plugins/*.so plugins/*.P
 	-rm -rf plugins/*.dSYM
 
 testclean:
-	-rm -f test/*.o test/*~ $(TEST_PROG)
+	-rm -f test/*.o test/*~ $(TEST_PROGRAMS)
 
 distclean: clean
 	-rm -f config.cache config.h config.log config.mk config.status

diff --git a/NEWS b/NEWS
@@ -1,3 +1,40 @@
+## Release 1.7 (February 2018)
+
+* `-i, -e` filtering: Major revamp, improved filtering by FORMAT fields
+  and missing values. New GT=ref,alt,mis etc keywords, check the documenation
+  for details.
+
+* `query`: Only matching expression are printed when both the -f and -i/-e
+  expressions contain genotype fields. Note that this changes the original
+  behavior. Previously all samples were output when one matching sample was
+  found. This functionality can be achieved by pre-filtering with view and then
+  streaming to query. Compare
+        bcftools query -f'[%CHROM:%POS %SAMPLE %GT\n]' -i'GT="alt"' file.bcf
+  and
+        bcftools view -i'GT="alt"' file.bcf -Ou | bcftools query -f'[%CHROM:%POS %SAMPLE %GT\n]'
+
+* `annotate`: New -k, --keep-sites option
+
+* `consensus`: Fix --iupac-codes output
+
+* `csq`: Homs always considered phased and other fixes
+
+* `norm`: Make `-c none` work and remove `query -c`
+
+* `roh`: Fix errors in the RG output
+
+* `stats`: Allow IUPAC ambiguity codes in the reference file; report the number of missing genotypes
+
+* `+fill-tags`: Add ExcHet annotation
+
+* `+setGt`: Fix bug in binom.test calculation, previously it worked only for nAlt<nRef!
+
+* `+split`: New plugin to split a multi-sample file into single-sample files in one go
+
+* Improve python3 compatibility in plotting scripts
+
+
+
 ## Release 1.6 (September 2017)
 
 * New `sort` command.

diff --git a/bcftools.h b/bcftools.h
@@ -63,6 +63,23 @@ static inline char gt2iupac(char a, char b)
     return iupac[(int)a][(int)b];
 }
 
+static inline int iupac_consistent(char iupac, char nt)
+{
+    static const char iupac_mask[90] = {
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,14,2,
+        13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,0,7,9,0,10
+    };
+    if ( iupac > 89 ) return 0;
+    if ( nt > 90 ) nt -=  32;  // lowercase 
+    if ( nt=='A' ) nt = 1;
+    else if ( nt=='C' ) nt = 2;
+    else if ( nt=='G' ) nt = 4;
+    else if ( nt=='T' ) nt = 8;
+    return iupac_mask[(int)iupac] & nt ? 1 : 0;
+}
+
 static inline char nt_to_upper(char nt)
 {
     if ( nt < 97 ) return nt;

diff --git a/consensus.c b/consensus.c
@@ -77,6 +77,7 @@ typedef struct
     rbuf_t vcf_rbuf;
     bcf1_t **vcf_buf;
     int nvcf_buf, rid;
+    char *chr;
 
     regidx_t *mask;
     regitr_t *itr;
@@ -121,6 +122,8 @@ static void destroy_chain(args_t *args)
     free(chain->block_lengths);
     free(chain);
     chain = NULL;
+    free(args->chr);
+    args->chr = NULL;
 }
 
 static void print_chain(args_t *args)
@@ -162,7 +165,7 @@ static void print_chain(args_t *args)
         score += chain->block_lengths[n];
     }
     score += last_block_size;
-    fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
+    fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
     for (n=0; n<chain->num; n++) {
         fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
     }
@@ -248,6 +251,7 @@ static void destroy_data(args_t *args)
         if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
     free(args->vcf_buf);
     free(args->fa_buf.s);
+    free(args->chr);
     if ( args->mask ) regidx_destroy(args->mask);
     if ( args->itr ) regitr_destroy(args->itr);
     if ( args->chain_fname )
@@ -276,6 +280,7 @@ static void init_region(args_t *args, char *line)
             else to--;
         }
     }
+    args->chr = strdup(line);
     args->rid = bcf_hdr_name2id(args->hdr,line);
     if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
     args->fa_buf.l = 0;
@@ -380,7 +385,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
         if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
     }
 
-    int i, ialt = 1;
+    int i, ialt = 1;    // the alternate allele
     if ( args->isample >= 0 )
     {
         bcf_unpack(rec, BCF_UN_FMT);
@@ -417,6 +422,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             {
                 char ial = rec->d.allele[ialt][0];
                 char jal = rec->d.allele[jalt][0];
+                if ( !ialt ) ialt = jalt;   // only ialt is used, make sure 0/1 is not ignored
                 rec->d.allele[ialt][0] = gt2iupac(ial,jal);
             }
         }
@@ -565,11 +571,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)
 
 static void mask_region(args_t *args, char *seq, int len)
 {
-    char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
     int start = args->fa_src_pos - len;
     int end   = args->fa_src_pos;
 
-    if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
+    if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
 
     int idx_start, idx_end, i;
     while ( regitr_overlap(args->itr) )

diff --git a/convert.c b/convert.c
@@ -1,6 +1,6 @@
 /*  convert.c -- functions for converting between VCF/BCF and related formats.
 
-    Copyright (C) 2013-2017 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
 
     Author: Petr Danecek <[email protected]>
 
@@ -92,6 +92,7 @@ struct _convert_t
     int ndat;
     char *undef_info_tag;
     int allow_undef_tags;
+    uint8_t **subset_samples;
 };
 
 typedef struct
@@ -174,6 +175,24 @@ static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
     }
     return ((int32_t*)bcf_array)[idx];
 }
+static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
+{
+    int n = 0, ibeg = 0;
+    while ( src[ibeg] && ibeg<len && n < idx )
+    {
+        if ( src[ibeg]==',' ) n++;
+        ibeg++;
+    }
+    if ( ibeg==len ) { kputc('.', str); return; }
+
+    int iend = ibeg;
+    while ( src[iend] && src[iend]!=',' && iend<len ) iend++;
+
+    if ( iend>ibeg )
+        kputsn(src+ibeg, iend-ibeg, str);
+    else
+        kputc('.', str);
+}
 static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
 {
     if ( fmt->id<0 )
@@ -232,6 +251,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
             case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
             case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
             case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
+            case BCF_BT_CHAR:  _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
             default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
         }
         #undef BRANCH
@@ -288,6 +308,8 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
             else
                 kputw(ival, str);
         }
+        else if ( fmt->fmt->type == BCF_BT_CHAR )
+            _copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str);
         else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
     }
     else
@@ -1312,6 +1334,9 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
             }
             for (js=0; js<convert->nsamples; js++)
             {
+                // Skip samples when filtering was requested
+                if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
+
                 // Here comes a hack designed for TBCSQ. When running on large files,
                 // such as 1000GP, there are too many empty fields in the output and
                 // it's very very slow. Therefore in case the handler does not add
@@ -1362,6 +1387,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
         case allow_undef_tags:
             convert->allow_undef_tags = va_arg(args, int);
             break;
+        case subset_samples:
+            convert->subset_samples = va_arg(args, uint8_t**);
+            break;
         default:
             ret = -1;
     }

diff --git a/convert.h b/convert.h
@@ -30,7 +30,8 @@ THE SOFTWARE.  */
 typedef struct _convert_t convert_t;
 enum convert_option
 {
-    allow_undef_tags
+    allow_undef_tags,
+    subset_samples,
 };
 
 convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);

diff --git a/csq.c b/csq.c
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2016 Genome Research Ltd.
+   Copyright (c) 2016-2018 Genome Research Ltd.
 
    Author: Petr Danecek <[email protected]>
    
@@ -726,7 +726,7 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char
         id = tbl->nstr++;
         hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
         tbl->str[id] = strdup(ss);
-        int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
+        khash_str2int_set(tbl->str2id, tbl->str[id], id);
     }
     *se = tmp;
 
@@ -2713,6 +2713,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
     }
 }
 
+
 void hap_finalize(args_t *args, hap_t *hap)
 {
     tscript_t *tr = hap->tr;
@@ -2784,7 +2785,10 @@ void hap_finalize(args_t *args, hap_t *hap)
                 }
                 dlen += hap->stack[i].node->dlen;
                 if ( hap->stack[i].node->dlen ) indel = 1;
-                if ( i<istack )
+
+                // This condition extends compound variants. Note that s/s/s sites are forced out to always break
+                // a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
+                if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
                 {
                     if ( dlen%3 )   // frameshift
                     {
@@ -3388,7 +3392,7 @@ int test_cds(args_t *args, bcf1_t *rec)
             int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
             if ( gt[0]==bcf_gt_missing ) continue;
 
-            if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
+            if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
             {
                 if ( args->phase==PHASE_MERGE )
                 {
@@ -3397,7 +3401,7 @@ int test_cds(args_t *args, bcf1_t *rec)
                 if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
                 {
                     if ( args->phase==PHASE_REQUIRE )
-                        error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+                        error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
                     if ( args->phase==PHASE_SKIP )
                         continue;
                     if ( args->phase==PHASE_NON_REF )
@@ -3648,6 +3652,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
     int call_csq = 1;
     if ( !rec->n_allele ) call_csq = 0;   // no alternate allele
     else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0;     // gVCF, no alt allele
+    else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0;                            // a symbolic allele, not ready for CNVs etc
     else if ( args->filter )
     {
         call_csq = filter_test(args->filter, rec, NULL);
@@ -3695,12 +3700,12 @@ static const char *usage(void)
         "   -c, --custom-tag <string>       use this tag instead of the default BCSQ\n"
         "   -l, --local-csq                 localized predictions, consider only one VCF record at a time\n"
         "   -n, --ncsq <int>                maximum number of consequences to consider per site [16]\n"
-        "   -p, --phase <a|m|r|R|s>         how to construct haplotypes and how to deal with unphased data: [r]\n"
+        "   -p, --phase <a|m|r|R|s>         how to handle unphased heterozygous genotypes: [r]\n"
         "                                     a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
         "                                     m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
         "                                     r: require phased GTs, throw an error on unphased het GTs\n"
         "                                     R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
-        "                                     s: skip unphased GTs\n"
+        "                                     s: skip unphased hets\n"
         "Options:\n"
         "   -e, --exclude <expr>            exclude sites for which the expression is true\n"
         "   -i, --include <expr>            select sites for which the expression is true\n"