Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/str operators #92

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

<groupId>org.molgenis</groupId>
<artifactId>vip-decision-tree</artifactId>
<version>4.1.1</version>
<version>4.2.0</version>

<name>vip-decision-tree</name>
<description>Decision tree module for filtering and labelling VCF files</description>
Expand Down Expand Up @@ -43,7 +43,7 @@
<commons.cli.version>1.6.0</commons.cli.version>
<samtools.htsjdk.version>4.1.0</samtools.htsjdk.version>
<jacoco-maven-plugin.version>0.8.11</jacoco-maven-plugin.version>
<vip.utils.version>2.0.0</vip.utils.version>
<vip.utils.version>2.1.0</vip.utils.version>
</properties>

<profiles>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,33 +1,64 @@
package org.molgenis.vcf.decisiontree.filter;

import static org.molgenis.vcf.decisiontree.filter.model.BoolNode.FIELD_PREFIX;
import static org.molgenis.vcf.decisiontree.filter.model.ValueType.FLOAT;
import static org.molgenis.vcf.decisiontree.filter.model.ValueType.INTEGER;

import java.util.Collection;
import org.molgenis.vcf.decisiontree.filter.model.BoolQuery;
import java.util.*;

import org.molgenis.vcf.decisiontree.filter.model.*;
import org.molgenis.vcf.decisiontree.filter.model.BoolQuery.Operator;
import org.molgenis.vcf.decisiontree.filter.model.DecisionNode;
import org.molgenis.vcf.decisiontree.filter.model.Field;
import org.molgenis.vcf.decisiontree.filter.model.SampleContext;
import org.molgenis.vcf.utils.UnexpectedEnumException;
import org.springframework.lang.Nullable;

interface BaseBoolNodeEvaluator<T extends DecisionNode> extends
NodeEvaluator<T> {

/**
 * Tells whether a value counts as missing.
 *
 * @param value the value to inspect, may be {@code null}
 * @return {@code true} if the value is {@code null} or an empty {@link Collection},
 *     {@code false} otherwise
 */
default boolean isMissingValue(Object value) {
  if (value == null) {
    return true;
  }
  if (!(value instanceof Collection<?>)) {
    // Any non-null, non-collection value is considered present.
    return false;
  }
  return ((Collection<?>) value).isEmpty();
}

/**
 * Evaluates a query against a value, honouring the query's multi mode.
 *
 * <ul>
 *   <li>{@code SINGLE}: the value is passed to the operator as-is.</li>
 *   <li>{@code ANY}: the value is cast to a {@link Collection}; the query matches as soon as one
 *       element matches. An empty collection never matches.</li>
 *   <li>{@code ALL}: the value is cast to a {@link Collection}; the query matches only if every
 *       element matches. An empty collection always matches.</li>
 * </ul>
 *
 * @param boolQuery query supplying the multi mode, field, operator and query value
 * @param value the value to test; must be a {@link Collection} for {@code ANY} and {@code ALL}
 * @return {@code true} if the value satisfies the query
 * @throws ClassCastException if the mode is {@code ANY} or {@code ALL} and the value is not a
 *     collection
 * @throws UnexpectedEnumException if the multi mode is not one of the modes handled here
 */
default boolean executeQuery(BoolQuery boolQuery, Object value) {
  MultiMode multiMode = boolQuery.getMultiMode();
  Field field = boolQuery.getField();
  Operator operator = boolQuery.getOperator();
  Object queryValue = boolQuery.getValue();

  return switch (multiMode) {
    case SINGLE -> executeSingleQuery(value, field, operator, queryValue);
    // anyMatch/allMatch short-circuit exactly like an early-returning loop would.
    case ANY -> ((Collection<?>) value).stream()
        .anyMatch(singleValue -> executeSingleQuery(singleValue, field, operator, queryValue));
    case ALL -> ((Collection<?>) value).stream()
        .allMatch(singleValue -> executeSingleQuery(singleValue, field, operator, queryValue));
    default -> throw new UnexpectedEnumException(multiMode);
  };
}

private boolean executeSingleQuery(Object value, Field field, Operator operator, Object queryValue) {
boolean matches;
switch (operator) {
case EQUALS:
matches = value.equals(queryValue);
break;
case EQUALS_SEQUENCE:
matches = executeSequenceEqualsQuery(field, value, queryValue);
break;
case NOT_EQUALS_SEQUENCE:
matches = !executeSequenceEqualsQuery(field, value, queryValue);
break;
case NOT_EQUALS:
matches = !value.equals(queryValue);
break;
Expand Down Expand Up @@ -64,13 +95,135 @@ default boolean executeQuery(BoolQuery boolQuery, Object value) {
case CONTAINS_NONE:
matches = executeContainsNoneQuery((Collection<?>) value, (Collection<?>) queryValue);
break;
case RANGE_OVERLAPS:
matches = executeRangeOverlapsQuery(field, (Collection<?>) value, queryValue);
break;
case RANGE_BELOW:
matches = executeRangeBelowQuery(field, (Collection<?>) value, queryValue);
break;
case RANGE_ABOVE:
matches = executeRangeAboveQuery(field, (Collection<?>) value, queryValue);
break;
default:
throw new UnexpectedEnumException(operator);
}

return matches;
}

/**
 * Tests whether a range lies entirely above the query value, i.e. whether the smallest of the
 * given values is strictly greater than the query value.
 *
 * <p>{@code INTEGER} fields are compared as integers; {@code FLOAT} and {@code RANGE} fields are
 * compared as doubles. {@code RANGE} is accepted here so that this operator works on the same
 * field types as {@code executeRangeBelowQuery}; its values are assumed to be {@link Double}s,
 * as that method treats them.
 *
 * @param field field whose value type selects the numeric comparison
 * @param values the values spanning the range; any {@link Collection} is accepted, not only lists
 * @param queryValue value whose string form is parsed as the threshold
 * @return {@code true} if the minimum of {@code values} is greater than the query value
 * @throws java.util.NoSuchElementException if {@code values} is empty
 * @throws NumberFormatException if the query value cannot be parsed for the field's value type
 * @throws ClassCastException if an element does not have the type implied by the value type
 * @throws UnexpectedEnumException if the field's value type is not numeric or a range
 */
default boolean executeRangeAboveQuery(Field field, Collection<?> values, Object queryValue) {
  ValueType valueType = field.getValueType();
  return switch (valueType) {
    case INTEGER -> {
      // Checked per-element cast instead of an unchecked cast of the whole collection to List.
      Integer lowest =
          values.stream().map(Integer.class::cast).min(Comparator.naturalOrder()).orElseThrow();
      yield lowest > Integer.parseInt(queryValue.toString());
    }
    case FLOAT, RANGE -> {
      Double lowest =
          values.stream().map(Double.class::cast).min(Comparator.naturalOrder()).orElseThrow();
      yield lowest > Double.parseDouble(queryValue.toString());
    }
    default -> throw new UnexpectedEnumException(valueType);
  };
}

/**
 * Tests whether a range lies entirely below the query value, i.e. whether the largest of the
 * given values is strictly smaller than the query value.
 *
 * <p>{@code RANGE} and {@code FLOAT} fields are compared as doubles; {@code INTEGER} fields are
 * compared as integers. {@code INTEGER} and {@code FLOAT} are accepted here so that this operator
 * works on the same field types as the "above" and "overlaps" range operators.
 *
 * @param field field whose value type selects the numeric comparison
 * @param values the values spanning the range; any {@link Collection} is accepted, not only lists
 * @param queryValue value whose string form is parsed as the threshold
 * @return {@code true} if the maximum of {@code values} is smaller than the query value
 * @throws java.util.NoSuchElementException if {@code values} is empty
 * @throws NumberFormatException if the query value cannot be parsed for the field's value type
 * @throws ClassCastException if an element does not have the type implied by the value type
 * @throws UnexpectedEnumException if the field's value type is not numeric or a range
 */
default boolean executeRangeBelowQuery(Field field, Collection<?> values, Object queryValue) {
  ValueType valueType = field.getValueType();
  return switch (valueType) {
    case INTEGER -> {
      // Checked per-element cast instead of an unchecked cast of the whole collection to List.
      Integer highest =
          values.stream().map(Integer.class::cast).max(Comparator.naturalOrder()).orElseThrow();
      yield highest < Integer.parseInt(queryValue.toString());
    }
    case FLOAT, RANGE -> {
      Double highest =
          values.stream().map(Double.class::cast).max(Comparator.naturalOrder()).orElseThrow();
      yield highest < Double.parseDouble(queryValue.toString());
    }
    default -> throw new UnexpectedEnumException(valueType);
  };
}

/**
 * Tests whether a range contains the query value, i.e. whether the query value lies between the
 * smallest and the largest of the given values (both bounds inclusive).
 *
 * <p>{@code INTEGER} fields are compared as integers; {@code FLOAT} and {@code RANGE} fields are
 * compared as doubles. {@code RANGE} is accepted here so that this operator works on the same
 * field types as {@code executeRangeBelowQuery}; its values are assumed to be {@link Double}s,
 * as that method treats them.
 *
 * @param field field whose value type selects the numeric comparison
 * @param values the values spanning the range; any {@link Collection} is accepted, not only lists
 * @param queryValue value whose string form is parsed as the point to locate
 * @return {@code true} if {@code min(values) <= queryValue <= max(values)}
 * @throws java.util.NoSuchElementException if {@code values} is empty
 * @throws NumberFormatException if the query value cannot be parsed for the field's value type
 * @throws ClassCastException if an element does not have the type implied by the value type
 * @throws UnexpectedEnumException if the field's value type is not numeric or a range
 */
default boolean executeRangeOverlapsQuery(Field field, Collection<?> values, Object queryValue) {
  ValueType valueType = field.getValueType();
  return switch (valueType) {
    case INTEGER -> {
      // Checked per-element cast instead of an unchecked cast of the whole collection to List.
      Integer highest =
          values.stream().map(Integer.class::cast).max(Comparator.naturalOrder()).orElseThrow();
      Integer lowest =
          values.stream().map(Integer.class::cast).min(Comparator.naturalOrder()).orElseThrow();
      // Parse the query value once rather than once per bound.
      int target = Integer.parseInt(queryValue.toString());
      yield highest >= target && lowest <= target;
    }
    case FLOAT, RANGE -> {
      Double highest =
          values.stream().map(Double.class::cast).max(Comparator.naturalOrder()).orElseThrow();
      Double lowest =
          values.stream().map(Double.class::cast).min(Comparator.naturalOrder()).orElseThrow();
      double target = Double.parseDouble(queryValue.toString());
      yield highest >= target && lowest <= target;
    }
    default -> throw new UnexpectedEnumException(valueType);
  };
}

/**
 * Compares a sequence value with a requested sequence, treating the value as circular: the value
 * matches if any left-rotation of it (including the unrotated value) matches the requested
 * sequence position by position according to {@code sequenceMatch}.
 *
 * @param field field being queried; must have value type {@code STRING}
 * @param value the actual value; its string form is compared
 * @param queryValue the requested sequence; {@code null} never matches
 * @return {@code true} if some rotation of the value matches; {@code false} if none does, if the
 *     query value is {@code null}, or if the lengths differ (two empty sequences do not match)
 * @throws UnexpectedEnumException if the field's value type is not {@code STRING}
 */
default boolean executeSequenceEqualsQuery(Field field, Object value, Object queryValue) {
  ValueType valueType = field.getValueType();
  if (valueType != ValueType.STRING) {
    throw new UnexpectedEnumException(valueType);
  }

  final String observed = value.toString();
  if (queryValue == null) {
    return false;
  }
  final String requested = queryValue.toString();
  final int length = observed.length();
  if (length != requested.length() || length == 0) {
    return false;
  }

  // Try the sequence as given, then each successive one-position left rotation.
  String candidate = observed;
  int rotation = 0;
  while (true) {
    if (sequenceMatch(candidate, requested)) {
      return true;
    }
    if (++rotation == length) {
      return false;
    }
    candidate = shift(candidate);
  }
}

/**
 * Checks position by position whether an actual sequence satisfies a requested sequence, where
 * each requested character is interpreted by {@code isIupacMatch}.
 *
 * @param actualValue the sequence to test; its length determines how many positions are compared
 * @param requestedValue the requested sequence, read at the same positions
 * @return {@code true} if every position matches, {@code false} at the first mismatch
 */
private static boolean sequenceMatch(String actualValue, String requestedValue) {
  final int length = actualValue.length();
  int position = 0;
  // Advance while positions keep matching; stopping early means a mismatch was found.
  while (position < length
      && isIupacMatch(requestedValue.charAt(position), actualValue.charAt(position))) {
    position++;
  }
  return position == length;
}

/**
 * Tells whether a concrete base is one of the bases denoted by an IUPAC nucleotide code.
 *
 * <p>Supported codes: G, A, T, C, R (G/A), Y (T/C), M (A/C), K (G/T), S (G/C), W (A/T),
 * H (A/C/T) and N (G/A/T/C).
 *
 * <p>Not supported:
 * <ul>
 *   <li>B: G or T or C (not-A, B follows A)</li>
 *   <li>V: G or C or A (not-T (not-U), V follows U)</li>
 *   <li>D: G or A or T (not-C, D follows C)</li>
 * </ul>
 *
 * @param iupac the IUPAC code from the requested sequence
 * @param base the base from the actual sequence
 * @return {@code true} if {@code base} is one of the bases the code stands for
 * @throws UnsupportedOperationException if the code is B, V or D, or is not a known IUPAC code
 */
private static boolean isIupacMatch(Character iupac, Character base) {
  // A switch avoids rebuilding a lookup map on every call and lets unknown codes fall through
  // to an explicit error instead of a NullPointerException on a missing map entry.
  final String allowedBases = switch (iupac) {
    case 'G' -> "G";
    case 'A' -> "A";
    case 'T' -> "T";
    case 'C' -> "C";
    case 'R' -> "GA";
    case 'Y' -> "TC";
    case 'M' -> "AC";
    case 'K' -> "GT";
    case 'S' -> "GC";
    case 'W' -> "AT";
    case 'H' -> "ACT";
    case 'N' -> "GATC";
    case 'B', 'V', 'D' -> throw new UnsupportedOperationException(
        String.format("IUPAC value '%s' is not supported.", iupac));
    default -> throw new UnsupportedOperationException(
        String.format("'%s' is not a valid IUPAC base.", iupac));
  };
  return allowedBases.indexOf(base.charValue()) >= 0;
}

/**
 * Rotates a sequence one position to the left: the first character moves to the end.
 *
 * @param sequence the non-empty sequence to rotate
 * @return the rotated sequence
 * @throws StringIndexOutOfBoundsException if {@code sequence} is empty
 */
private String shift(String sequence) {
  final char head = sequence.charAt(0);
  final int length = sequence.length();
  return new StringBuilder(length)
      .append(sequence, 1, length)
      .append(head)
      .toString();
}

@SuppressWarnings("DuplicatedCode")
default boolean executeLessQuery(Field field, Object value, Object queryValue) {
boolean matches;
Expand Down Expand Up @@ -133,7 +286,7 @@ default BoolQuery postProcessQuery(
String stringQueryValue = query.getValue().toString();
if (stringQueryValue.startsWith(FIELD_PREFIX)) {
String fieldId = stringQueryValue.substring(FIELD_PREFIX.length());
query = BoolQuery.builder().field(query.getField()).operator(query.getOperator())
query = BoolQuery.builder().multiMode(query.getMultiMode()).field(query.getField()).operator(query.getOperator())
.value(variant.getValue(variant.getVcfMetadata().getField(fieldId), sampleContext))
.build();
}
Expand Down
71 changes: 61 additions & 10 deletions src/main/java/org/molgenis/vcf/decisiontree/filter/VcfMetadata.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
package org.molgenis.vcf.decisiontree.filter;

import static java.util.Objects.requireNonNull;
import static org.molgenis.vcf.decisiontree.filter.model.FieldType.COMMON;
import static org.molgenis.vcf.decisiontree.filter.model.FieldType.INFO_VEP;
import static org.molgenis.vcf.decisiontree.filter.model.FieldType.SAMPLE;
import static org.molgenis.vcf.decisiontree.filter.model.FieldType.*;
import static org.molgenis.vcf.decisiontree.filter.model.ValueCount.Type.VARIABLE;
import static org.molgenis.vcf.decisiontree.utils.VcfUtils.FIELD_TOKEN_SEPARATOR;
import static org.molgenis.vcf.decisiontree.utils.VcfUtils.toFieldType;

Expand All @@ -14,16 +13,15 @@
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.molgenis.vcf.decisiontree.filter.model.Field;
import org.molgenis.vcf.decisiontree.filter.model.FieldImpl;
import org.molgenis.vcf.decisiontree.filter.model.FieldType;
import org.molgenis.vcf.decisiontree.filter.model.MissingField;
import org.molgenis.vcf.decisiontree.filter.model.ValueCount;

import org.molgenis.vcf.decisiontree.filter.model.*;
import org.molgenis.vcf.decisiontree.filter.model.ValueCount.Type;
import org.molgenis.vcf.decisiontree.filter.model.ValueCount.ValueCountBuilder;
import org.molgenis.vcf.decisiontree.filter.model.ValueType;
import org.molgenis.vcf.decisiontree.runner.info.NestedHeaderLine;
import org.molgenis.vcf.utils.UnexpectedEnumException;
import org.molgenis.vcf.utils.metadata.MetadataService;
import org.molgenis.vcf.utils.model.FieldMetadata;
import org.molgenis.vcf.utils.model.NumberType;

/**
* {@link VCFHeader} wrapper that works with nested metadata (e.g. CSQ INFO fields).
Expand All @@ -35,11 +33,14 @@ public class VcfMetadata {
private final NestedHeaderLine nestedVepHeaderLine;
private final NestedHeaderLine nestedGenotypeHeaderLine;

private final MetadataService metadataService;

public VcfMetadata(VCFHeader vcfHeader, NestedHeaderLine nestedVepHeaderLine,
NestedHeaderLine nestedGenotypeHeaderLine, boolean strict) {
NestedHeaderLine nestedGenotypeHeaderLine, MetadataService metadataService, boolean strict) {
this.vcfHeader = requireNonNull(vcfHeader);
this.nestedVepHeaderLine = requireNonNull(nestedVepHeaderLine);
this.nestedGenotypeHeaderLine = requireNonNull(nestedGenotypeHeaderLine);
this.metadataService = requireNonNull(metadataService);
this.strict = strict;
}

Expand Down Expand Up @@ -152,10 +153,31 @@ private FieldImpl toCommonField(List<String> fieldTokens) {
}

private Field toCompoundField(List<String> fieldTokens, FieldType fieldType) {
//HERE?
if (fieldTokens.size() != 2) {
throw new InvalidNumberOfTokensException(fieldTokens, fieldType, 2);
}
String field = fieldTokens.get(1);
if(fieldType == INFO){
if(metadataService.getFieldMetadatas() != null && metadataService.getFieldMetadatas().getInfo().containsKey(field)){
FieldMetadata fieldMetadata = metadataService.getFieldMetadatas().getInfo().get(field);
return FieldImpl.builder().id(field).fieldType(INFO)
.valueType(mapValueType(fieldMetadata.getField().getType()))
.valueCount(
mapValueCount(fieldMetadata.getField().getNumberType(), fieldMetadata.getField().getNumberCount(), fieldMetadata.getField().isRequired()))
.separator(fieldMetadata.getField().getSeparator()).build();
}
}
if(fieldType == FORMAT) {
if (metadataService.getFieldMetadatas() != null && metadataService.getFieldMetadatas().getFormat().containsKey(field)) {
org.molgenis.vcf.utils.model.Field formatFieldMetadata = metadataService.getFieldMetadatas().getFormat().get(field);
return FieldImpl.builder().id(field).fieldType(FORMAT)
.valueType(mapValueType(formatFieldMetadata.getType()))
.valueCount(
mapValueCount(formatFieldMetadata.getNumberType(), formatFieldMetadata.getNumberCount(), formatFieldMetadata.isRequired()))
.separator(formatFieldMetadata.getSeparator()).build();
}
}
VCFCompoundHeaderLine vcfCompoundHeaderLine = getVcfCompoundHeaderLine(fieldType, field);

if (vcfCompoundHeaderLine == null) {
Expand Down Expand Up @@ -194,8 +216,37 @@ private Field toCompoundField(List<String> fieldTokens, FieldType fieldType) {
.valueType(valueType)
.valueCount(builder.build())
.build();
}

/**
 * Translates a value type from the shared metadata model into this module's {@link ValueType}.
 * {@code STRING} and {@code CATEGORICAL} both map to {@code STRING}; every other type maps to the
 * constant of the same name.
 *
 * @param type the metadata value type
 * @return the corresponding decision-tree value type
 * @throws UnexpectedEnumException if the type has no mapping
 */
private ValueType mapValueType(org.molgenis.vcf.utils.model.ValueType type) {
  switch (type) {
    case INTEGER:
      return ValueType.INTEGER;
    case FLOAT:
      return ValueType.FLOAT;
    case FLAG:
      return ValueType.FLAG;
    case CHARACTER:
      return ValueType.CHARACTER;
    case STRING:
    case CATEGORICAL:
      return ValueType.STRING;
    case RANGE:
      return ValueType.RANGE;
    //noinspection UnnecessaryDefault
    default:
      throw new UnexpectedEnumException(type);
  }
}

/**
 * Builds a {@link ValueCount} from the number information of a metadata field.
 *
 * @param numberType the metadata number type, translated via {@code mapNumberType}
 * @param numberCount the count passed to the builder unchanged
 * @param required whether the field is required; the resulting count is nullable when it is not
 * @return the assembled value count
 */
private ValueCount mapValueCount(NumberType numberType, Integer numberCount, boolean required) {
  final boolean nullable = !required;
  return ValueCount.builder()
      .type(mapNumberType(numberType))
      .count(numberCount)
      .nullable(nullable)
      .build();
}

/**
 * Translates a number type from the shared metadata model into this module's value count
 * {@link Type}.
 *
 * @param numberType the metadata number type
 * @return {@code FIXED} for {@code NUMBER}, {@code A} for {@code PER_ALT}, {@code R} for
 *     {@code PER_ALT_AND_REF}, {@code G} for {@code PER_GENOTYPE} and {@code VARIABLE} for
 *     {@code OTHER}
 * @throws UnexpectedEnumException if the number type has no mapping
 */
private Type mapNumberType(NumberType numberType) {
  switch (numberType) {
    case NUMBER:
      return Type.FIXED;
    case PER_ALT:
      return Type.A;
    case PER_ALT_AND_REF:
      return Type.R;
    case PER_GENOTYPE:
      return Type.G;
    case OTHER:
      return VARIABLE;
    //noinspection UnnecessaryDefault
    default:
      throw new UnexpectedEnumException(numberType);
  }
}
private VCFCompoundHeaderLine getVcfCompoundHeaderLine(FieldType fieldType, String field) {
VCFCompoundHeaderLine vcfCompoundHeaderLine;
switch (fieldType) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.molgenis.vcf.decisiontree.runner.info.GenotypeMetadataMapper;
import org.molgenis.vcf.decisiontree.runner.info.NestedHeaderLine;
import org.molgenis.vcf.decisiontree.runner.info.VepMetadataParser;
import org.molgenis.vcf.utils.metadata.MetadataService;

/**
* {@link VCFFileReader} wrapper that works with nested metadata and data (e.g. CSQ INFO fields).
Expand All @@ -21,13 +22,13 @@ public class VcfReader implements AutoCloseable {
private boolean inited = false;
private NestedHeaderLine vepNestedHeaderLine = null;
private NestedHeaderLine gtNestedHeaderLine = null;

private final MetadataService metadataService;
public VcfReader(VCFFileReader vcfFileReader, VepMetadataParser vepMetadataParser,
GenotypeMetadataMapper genotypeMetadataMapper,
boolean strict) {
GenotypeMetadataMapper genotypeMetadataMapper, MetadataService metadataService, boolean strict) {
this.vcfFileReader = requireNonNull(vcfFileReader);
this.vepMetadataParser = requireNonNull(vepMetadataParser);
this.genotypeMetadataMapper = requireNonNull(genotypeMetadataMapper);
this.metadataService = requireNonNull(metadataService);
this.strict = strict;
}

Expand All @@ -46,7 +47,7 @@ public Stream<VcfRecord> stream() {
public VcfMetadata getMetadata() {
initNestedMeta();
return new VcfMetadata(vcfFileReader.getFileHeader(), vepNestedHeaderLine, gtNestedHeaderLine,
strict);
metadataService, strict);
}

@Override
Expand Down
Loading