-
Notifications
You must be signed in to change notification settings - Fork 242
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Release 1.7: improved filtering; many other upgrades
- Loading branch information
Showing
148 changed files
with
3,546 additions
and
1,068 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
/* convert.c -- functions for converting between VCF/BCF and related formats. | ||
Copyright (C) 2013-2017 Genome Research Ltd. | ||
Copyright (C) 2013-2018 Genome Research Ltd. | ||
Author: Petr Danecek <[email protected]> | ||
|
@@ -92,6 +92,7 @@ struct _convert_t | |
int ndat; | ||
char *undef_info_tag; | ||
int allow_undef_tags; | ||
uint8_t **subset_samples; | ||
}; | ||
|
||
typedef struct | ||
|
@@ -174,6 +175,24 @@ static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx) | |
} | ||
return ((int32_t*)bcf_array)[idx]; | ||
} | ||
static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str) | ||
{ | ||
int n = 0, ibeg = 0; | ||
while ( src[ibeg] && ibeg<len && n < idx ) | ||
{ | ||
if ( src[ibeg]==',' ) n++; | ||
ibeg++; | ||
} | ||
if ( ibeg==len ) { kputc('.', str); return; } | ||
|
||
int iend = ibeg; | ||
while ( src[iend] && src[iend]!=',' && iend<len ) iend++; | ||
|
||
if ( iend>ibeg ) | ||
kputsn(src+ibeg, iend-ibeg, str); | ||
else | ||
kputc('.', str); | ||
} | ||
static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) | ||
{ | ||
if ( fmt->id<0 ) | ||
|
@@ -232,6 +251,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp | |
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break; | ||
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break; | ||
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break; | ||
case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break; | ||
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break; | ||
} | ||
#undef BRANCH | ||
|
@@ -288,6 +308,8 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa | |
else | ||
kputw(ival, str); | ||
} | ||
else if ( fmt->fmt->type == BCF_BT_CHAR ) | ||
_copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str); | ||
else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type); | ||
} | ||
else | ||
|
@@ -1312,6 +1334,9 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) | |
} | ||
for (js=0; js<convert->nsamples; js++) | ||
{ | ||
// Skip samples when filtering was requested | ||
if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; | ||
|
||
// Here comes a hack designed for TBCSQ. When running on large files, | ||
// such as 1000GP, there are too many empty fields in the output and | ||
// it's very very slow. Therefore in case the handler does not add | ||
|
@@ -1362,6 +1387,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...) | |
case allow_undef_tags: | ||
convert->allow_undef_tags = va_arg(args, int); | ||
break; | ||
case subset_samples: | ||
convert->subset_samples = va_arg(args, uint8_t**); | ||
break; | ||
default: | ||
ret = -1; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
/* The MIT License | ||
Copyright (c) 2016 Genome Research Ltd. | ||
Copyright (c) 2016-2018 Genome Research Ltd. | ||
Author: Petr Danecek <[email protected]> | ||
|
@@ -726,7 +726,7 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char | |
id = tbl->nstr++; | ||
hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); | ||
tbl->str[id] = strdup(ss); | ||
int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id); | ||
khash_str2int_set(tbl->str2id, tbl->str[id], id); | ||
} | ||
*se = tmp; | ||
|
||
|
@@ -2713,6 +2713,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, | |
} | ||
} | ||
|
||
|
||
void hap_finalize(args_t *args, hap_t *hap) | ||
{ | ||
tscript_t *tr = hap->tr; | ||
|
@@ -2784,7 +2785,10 @@ void hap_finalize(args_t *args, hap_t *hap) | |
} | ||
dlen += hap->stack[i].node->dlen; | ||
if ( hap->stack[i].node->dlen ) indel = 1; | ||
if ( i<istack ) | ||
|
||
// This condition extends compound variants. Note that s/s/s sites are forced out to always break | ||
// a compound block. See ENST00000271583/splice-acceptor.vcf for motivation. | ||
if ( i<istack && hap->stack[i+1].node->type != HAP_SSS ) | ||
{ | ||
if ( dlen%3 ) // frameshift | ||
{ | ||
|
@@ -3388,7 +3392,7 @@ int test_cds(args_t *args, bcf1_t *rec) | |
int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts; | ||
if ( gt[0]==bcf_gt_missing ) continue; | ||
|
||
if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end ) | ||
if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) ) | ||
{ | ||
if ( args->phase==PHASE_MERGE ) | ||
{ | ||
|
@@ -3397,7 +3401,7 @@ int test_cds(args_t *args, bcf1_t *rec) | |
if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) | ||
{ | ||
if ( args->phase==PHASE_REQUIRE ) | ||
error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); | ||
error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); | ||
if ( args->phase==PHASE_SKIP ) | ||
continue; | ||
if ( args->phase==PHASE_NON_REF ) | ||
|
@@ -3648,6 +3652,7 @@ void process(args_t *args, bcf1_t **rec_ptr) | |
int call_csq = 1; | ||
if ( !rec->n_allele ) call_csq = 0; // no alternate allele | ||
else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele | ||
else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc | ||
else if ( args->filter ) | ||
{ | ||
call_csq = filter_test(args->filter, rec, NULL); | ||
|
@@ -3695,12 +3700,12 @@ static const char *usage(void) | |
" -c, --custom-tag <string> use this tag instead of the default BCSQ\n" | ||
" -l, --local-csq localized predictions, consider only one VCF record at a time\n" | ||
" -n, --ncsq <int> maximum number of consequences to consider per site [16]\n" | ||
" -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n" | ||
" -p, --phase <a|m|r|R|s> how to handle unphased heterozygous genotypes: [r]\n" | ||
" a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" | ||
" m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" | ||
" r: require phased GTs, throw an error on unphased het GTs\n" | ||
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" | ||
" s: skip unphased GTs\n" | ||
" s: skip unphased hets\n" | ||
"Options:\n" | ||
" -e, --exclude <expr> exclude sites for which the expression is true\n" | ||
" -i, --include <expr> select sites for which the expression is true\n" | ||
|
Oops, something went wrong.