Skip to content

Commit

Permalink
Release 1.7: improved filtering; many other upgrades
Browse files Browse the repository at this point in the history
  • Loading branch information
daviesrob committed Feb 12, 2018
2 parents 7b813be + 69c3317 commit 33b14f1
Show file tree
Hide file tree
Showing 148 changed files with 3,546 additions and 1,068 deletions.
10 changes: 5 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ endif

include config.mk

PACKAGE_VERSION = 1.6
PACKAGE_VERSION = 1.7

# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
Expand Down Expand Up @@ -148,7 +148,7 @@ PLATFORM := $(shell uname -s)
endif
ifeq "$(PLATFORM)" "Darwin"
$(PLUGINS): | bcftools
PLUGIN_FLAGS = -bundle -bundle_loader bcftools
PLUGIN_FLAGS = -bundle -bundle_loader bcftools -Wl,-undefined,dynamic_lookup
else
PLUGIN_FLAGS = -fPIC -shared
endif
Expand Down Expand Up @@ -272,23 +272,23 @@ docs: doc/bcftools.1 doc/bcftools.html
# bcftools.1 is a generated file from the asciidoc bcftools.txt file.
# Since there is no make dependency, bcftools.1 can be out-of-date and
# make docs can be run to update if asciidoc is available
install: $(PROG) $(PLUGINS)
install: $(PROGRAMS) $(PLUGINS)
$(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(man1dir) $(DESTDIR)$(plugindir)
$(INSTALL_PROGRAM) $(PROGRAMS) $(DESTDIR)$(bindir)
$(INSTALL_SCRIPT) $(MISC_SCRIPTS) $(DESTDIR)$(misc_bindir)
$(INSTALL_MAN) doc/bcftools.1 $(DESTDIR)$(man1dir)
$(INSTALL_PROGRAM) plugins/*.so $(DESTDIR)$(plugindir)

clean: testclean clean-plugins
-rm -f gmon.out *.o *~ $(PROG) version.h plugins/*.so plugins/*.P
-rm -f gmon.out *.o *~ $(PROGRAMS) version.h plugins/*.so plugins/*.P
-rm -rf *.dSYM plugins/*.dSYM test/*.dSYM

clean-plugins:
-rm -f plugins/*.so plugins/*.P
-rm -rf plugins/*.dSYM

testclean:
-rm -f test/*.o test/*~ $(TEST_PROG)
-rm -f test/*.o test/*~ $(TEST_PROGRAMS)

distclean: clean
-rm -f config.cache config.h config.log config.mk config.status
Expand Down
37 changes: 37 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,40 @@
## Release 1.7 (February 2018)

* `-i, -e` filtering: Major revamp, improved filtering by FORMAT fields
and missing values. New GT=ref,alt,mis etc keywords, check the documenation
for details.

* `query`: Only matching expression are printed when both the -f and -i/-e
expressions contain genotype fields. Note that this changes the original
behavior. Previously all samples were output when one matching sample was
found. This functionality can be achieved by pre-filtering with view and then
streaming to query. Compare
bcftools query -f'[%CHROM:%POS %SAMPLE %GT\n]' -i'GT="alt"' file.bcf
and
bcftools view -i'GT="alt"' file.bcf -Ou | bcftools query -f'[%CHROM:%POS %SAMPLE %GT\n]'

* `annotate`: New -k, --keep-sites option

* `consensus`: Fix --iupac-codes output

* `csq`: Homs always considered phased and other fixes

* `norm`: Make `-c none` work and remove `query -c`

* `roh`: Fix errors in the RG output

* `stats`: Allow IUPAC ambiguity codes in the reference file; report the number of missing genotypes

* `+fill-tags`: Add ExcHet annotation

* `+setGt`: Fix bug in binom.test calculation, previously it worked only for nAlt<nRef!

* `+split`: New plugin to split a multi-sample file into single-sample files in one go

* Improve python3 compatibility in plotting scripts



## Release 1.6 (September 2017)

* New `sort` command.
Expand Down
17 changes: 17 additions & 0 deletions bcftools.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,23 @@ static inline char gt2iupac(char a, char b)
return iupac[(int)a][(int)b];
}

static inline int iupac_consistent(char iupac, char nt)
{
static const char iupac_mask[90] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,14,2,
13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,0,7,9,0,10
};
if ( iupac > 89 ) return 0;
if ( nt > 90 ) nt -= 32; // lowercase
if ( nt=='A' ) nt = 1;
else if ( nt=='C' ) nt = 2;
else if ( nt=='G' ) nt = 4;
else if ( nt=='T' ) nt = 8;
return iupac_mask[(int)iupac] & nt ? 1 : 0;
}

static inline char nt_to_upper(char nt)
{
if ( nt < 97 ) return nt;
Expand Down
13 changes: 9 additions & 4 deletions consensus.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ typedef struct
rbuf_t vcf_rbuf;
bcf1_t **vcf_buf;
int nvcf_buf, rid;
char *chr;

regidx_t *mask;
regitr_t *itr;
Expand Down Expand Up @@ -121,6 +122,8 @@ static void destroy_chain(args_t *args)
free(chain->block_lengths);
free(chain);
chain = NULL;
free(args->chr);
args->chr = NULL;
}

static void print_chain(args_t *args)
Expand Down Expand Up @@ -162,7 +165,7 @@ static void print_chain(args_t *args)
score += chain->block_lengths[n];
}
score += last_block_size;
fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
for (n=0; n<chain->num; n++) {
fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
}
Expand Down Expand Up @@ -248,6 +251,7 @@ static void destroy_data(args_t *args)
if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
free(args->vcf_buf);
free(args->fa_buf.s);
free(args->chr);
if ( args->mask ) regidx_destroy(args->mask);
if ( args->itr ) regitr_destroy(args->itr);
if ( args->chain_fname )
Expand Down Expand Up @@ -276,6 +280,7 @@ static void init_region(args_t *args, char *line)
else to--;
}
}
args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
args->fa_buf.l = 0;
Expand Down Expand Up @@ -380,7 +385,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
}

int i, ialt = 1;
int i, ialt = 1; // the alternate allele
if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
Expand Down Expand Up @@ -417,6 +422,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
{
char ial = rec->d.allele[ialt][0];
char jal = rec->d.allele[jalt][0];
if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
}
}
Expand Down Expand Up @@ -565,11 +571,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)

static void mask_region(args_t *args, char *seq, int len)
{
char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;

if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;

int idx_start, idx_end, i;
while ( regitr_overlap(args->itr) )
Expand Down
30 changes: 29 additions & 1 deletion convert.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* convert.c -- functions for converting between VCF/BCF and related formats.
Copyright (C) 2013-2017 Genome Research Ltd.
Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <[email protected]>
Expand Down Expand Up @@ -92,6 +92,7 @@ struct _convert_t
int ndat;
char *undef_info_tag;
int allow_undef_tags;
uint8_t **subset_samples;
};

typedef struct
Expand Down Expand Up @@ -174,6 +175,24 @@ static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
}
return ((int32_t*)bcf_array)[idx];
}
static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
{
int n = 0, ibeg = 0;
while ( src[ibeg] && ibeg<len && n < idx )
{
if ( src[ibeg]==',' ) n++;
ibeg++;
}
if ( ibeg==len ) { kputc('.', str); return; }

int iend = ibeg;
while ( src[iend] && src[iend]!=',' && iend<len ) iend++;

if ( iend>ibeg )
kputsn(src+ibeg, iend-ibeg, str);
else
kputc('.', str);
}
static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( fmt->id<0 )
Expand Down Expand Up @@ -232,6 +251,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
Expand Down Expand Up @@ -288,6 +308,8 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
else
kputw(ival, str);
}
else if ( fmt->fmt->type == BCF_BT_CHAR )
_copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str);
else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
}
else
Expand Down Expand Up @@ -1312,6 +1334,9 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
}
for (js=0; js<convert->nsamples; js++)
{
// Skip samples when filtering was requested
if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;

// Here comes a hack designed for TBCSQ. When running on large files,
// such as 1000GP, there are too many empty fields in the output and
// it's very very slow. Therefore in case the handler does not add
Expand Down Expand Up @@ -1362,6 +1387,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
case allow_undef_tags:
convert->allow_undef_tags = va_arg(args, int);
break;
case subset_samples:
convert->subset_samples = va_arg(args, uint8_t**);
break;
default:
ret = -1;
}
Expand Down
3 changes: 2 additions & 1 deletion convert.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ THE SOFTWARE. */
typedef struct _convert_t convert_t;
enum convert_option
{
allow_undef_tags
allow_undef_tags,
subset_samples,
};

convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
Expand Down
19 changes: 12 additions & 7 deletions csq.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* The MIT License
Copyright (c) 2016 Genome Research Ltd.
Copyright (c) 2016-2018 Genome Research Ltd.
Author: Petr Danecek <[email protected]>
Expand Down Expand Up @@ -726,7 +726,7 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char
id = tbl->nstr++;
hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
tbl->str[id] = strdup(ss);
int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
khash_str2int_set(tbl->str2id, tbl->str[id], id);
}
*se = tmp;

Expand Down Expand Up @@ -2713,6 +2713,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
}
}


void hap_finalize(args_t *args, hap_t *hap)
{
tscript_t *tr = hap->tr;
Expand Down Expand Up @@ -2784,7 +2785,10 @@ void hap_finalize(args_t *args, hap_t *hap)
}
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
if ( i<istack )

// This condition extends compound variants. Note that s/s/s sites are forced out to always break
// a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
{
if ( dlen%3 ) // frameshift
{
Expand Down Expand Up @@ -3388,7 +3392,7 @@ int test_cds(args_t *args, bcf1_t *rec)
int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
if ( gt[0]==bcf_gt_missing ) continue;

if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
{
if ( args->phase==PHASE_MERGE )
{
Expand All @@ -3397,7 +3401,7 @@ int test_cds(args_t *args, bcf1_t *rec)
if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
{
if ( args->phase==PHASE_REQUIRE )
error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
if ( args->phase==PHASE_SKIP )
continue;
if ( args->phase==PHASE_NON_REF )
Expand Down Expand Up @@ -3648,6 +3652,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
int call_csq = 1;
if ( !rec->n_allele ) call_csq = 0; // no alternate allele
else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele
else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc
else if ( args->filter )
{
call_csq = filter_test(args->filter, rec, NULL);
Expand Down Expand Up @@ -3695,12 +3700,12 @@ static const char *usage(void)
" -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
" -l, --local-csq localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
" -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n"
" -p, --phase <a|m|r|R|s> how to handle unphased heterozygous genotypes: [r]\n"
" a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
" m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
" s: skip unphased GTs\n"
" s: skip unphased hets\n"
"Options:\n"
" -e, --exclude <expr> exclude sites for which the expression is true\n"
" -i, --include <expr> select sites for which the expression is true\n"
Expand Down
Loading

0 comments on commit 33b14f1

Please sign in to comment.