Skip to content

Commit

Permalink
Merge pull request #282 from opencb/TASK-4682
Browse files Browse the repository at this point in the history
TASK-4682 - Error in VariantExport VCF : Duplicate allele (pre-step to exomiser)
  • Loading branch information
j-coll authored Nov 25, 2024
2 parents fbae153 + 2520821 commit 5c2b5fc
Show file tree
Hide file tree
Showing 4 changed files with 338 additions and 52 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.tools.commons.Converter;
import org.opencb.commons.datastore.core.ObjectMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.text.DecimalFormat;
import java.util.*;
Expand All @@ -38,6 +40,7 @@
* Created by jtarraga on 07/02/17.
*/
public abstract class VariantContextConverter<T> implements Converter<T, VariantContext> {
private final Logger logger = LoggerFactory.getLogger(VariantContextConverter.class);

public static final DecimalFormat DECIMAL_FORMAT_7 = new DecimalFormat("#.#######");
public static final DecimalFormat DECIMAL_FORMAT_3 = new DecimalFormat("#.###");
Expand Down Expand Up @@ -148,8 +151,7 @@ protected static MutablePair<Integer, Integer> adjustedVariantStart(
protected static String getReferenceBase(String chromosome, int from, int to, Map<Integer, Character> referenceAlleles) {
int length = to - from;
if (length < 0) {
throw new IllegalStateException(
"Sequence length is negative: chromosome " + chromosome + " from " + from + " to " + to);
return "";
}
StringBuilder sb = new StringBuilder(length);
for (int i = from; i < to; i++) {
Expand Down Expand Up @@ -246,15 +248,15 @@ protected double getQuality(List<Map<String, String>> fileAttributes) {
}
} catch (NumberFormatException e) {
// Nothing to do
e.getMessage();
logger.warn("Invalid QUAL value found: " + attrs.get(StudyEntry.QUAL));
}
}
}
}
return (qual == Double.MAX_VALUE ? VariantContext.NO_LOG10_PERROR : (-0.1 * qual));
}

protected List<Genotype> getGenotypes(List<String> alleleList, List<String> sampleDataKeys, BiFunction<String, String, String> getSampleData) {
protected List<Genotype> getGenotypes(List<String> alleleList, List<String> sampleDataKeys, BiFunction<String, String, String> getSampleData, Set<Integer> discardedAlleles) {
String refAllele = alleleList.get(0);
Set<Integer> noCallAlleles = getNoCallAlleleIdx(alleleList);

Expand All @@ -265,7 +267,7 @@ protected List<Genotype> getGenotypes(List<String> alleleList, List<String> samp
sampleDataKeys = this.sampleFormats;
}

for (String sampleName : this.sampleNames) {
samplesLoop: for (String sampleName : this.sampleNames) {
GenotypeBuilder genotypeBuilder = new GenotypeBuilder().name(sampleName);
for (String key : sampleDataKeys) {
String value = getSampleData.apply(sampleName, key);
Expand All @@ -279,7 +281,20 @@ protected List<Genotype> getGenotypes(List<String> alleleList, List<String> samp
List<Allele> alleles = new ArrayList<>();
for (int gtIdx : genotype.getAllelesIdx()) {
if (gtIdx < alleleList.size() && gtIdx >= 0 && !noCallAlleles.contains(gtIdx)) { // .. AND NOT a nocall allele
alleles.add(Allele.create(alleleList.get(gtIdx), gtIdx == 0)); // allele is ref. if the alleleIndex is 0
String alternate = alleleList.get(gtIdx);
if (discardedAlleles.contains(gtIdx)) {
// If this genotype contains a duplicated allele, and it is not the first occurrence, skip it
logger.warn("Skipping allele '" + alleleToString(alternate) + "' for sample '" + sampleName + "'");
genotypes.add(new GenotypeBuilder().name(sampleName)
.alleles(Arrays.asList(
Allele.create(NO_CALL_ALLELE, false),
Allele.create(NO_CALL_ALLELE, false)
)).make()
);
// skip the rest of the sample data
continue samplesLoop;
}
alleles.add(Allele.create(alternate, gtIdx == 0)); // allele is ref. if the alleleIndex is 0
} else {
alleles.add(Allele.create(NO_CALL_ALLELE, false)); // genotype of a secondary alternate, or an actual missing
}
Expand Down Expand Up @@ -341,7 +356,9 @@ private int[] getInts(String value) {
return ints;
}

protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List<String> alleleList, boolean isNoVariation, Set<String> filters, double qual, ObjectMap attributes, List<Genotype> genotypes) {
protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List<String> alleleList,
boolean isNoVariation, Set<String> filters, double qual, ObjectMap attributes,
List<Genotype> genotypes, Set<Integer> discardedAlleles) {
String refAllele = alleleList.get(0);
VariantContextBuilder variantContextBuilder = new VariantContextBuilder()
.chr(chromosome)
Expand All @@ -355,7 +372,15 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en
if (isNoVariation && alleleList.get(1).isEmpty()) {
variantContextBuilder.alleles(refAllele);
} else {
variantContextBuilder.alleles(alleleList.stream().filter(a -> !a.equals(NO_CALL_ALLELE)).collect(Collectors.toList()));
List<String> finalAlleles = new ArrayList<>(alleleList);
for (Integer i : discardedAlleles) {
finalAlleles.set(i, null);
}
finalAlleles = finalAlleles.stream()
.filter(Objects::nonNull)
.filter(a -> !a.equals(NO_CALL_ALLELE))
.collect(Collectors.toList());
variantContextBuilder.alleles(finalAlleles);
}

if (genotypes.isEmpty()) {
Expand All @@ -364,15 +389,63 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en
variantContextBuilder.genotypes(genotypes);
}


if (isSymbolic(alleleList.get(1))) {
attributes.append(VCFConstants.END_KEY, end);
}
variantContextBuilder.attributes(attributes);

variantContextBuilder.id(idForVcf);

return variantContextBuilder.make();
try {
return variantContextBuilder.make();
} catch (RuntimeException e) {
throw new IllegalArgumentException(
"Error creating VariantContext: " + chromosome + ":" + start + "-" + end + ":" + alleleList, e);
}
}

/**
* Check if the allele is a symbolic allele other than <NON_REF> or <*>
* @param allele Allele
* @return True if the allele is a symbolic allele
*/
protected static boolean isSymbolic(String allele) {
return allele.startsWith("<") && allele.endsWith(">") && !isNonRef(allele);
}

protected static boolean isNonRef(String allele) {
return allele.equals(Allele.NON_REF_STRING) || allele.equals(Allele.UNSPECIFIED_ALTERNATE_ALLELE_STRING);
}

protected Set<String> getDuplicatedAlleles(List<String> alleleList) {
Set<String> duplicatedAlleles;
if (alleleList.size() > 2 && new HashSet<>(alleleList).size() != alleleList.size()) {
Set<String> allelesSet = new HashSet<>();

duplicatedAlleles = new HashSet<>();
for (String allele : alleleList) {
if (!allelesSet.add(allele)) {
duplicatedAlleles.add(allele);
}
}
} else {
duplicatedAlleles = Collections.emptySet();
}
return duplicatedAlleles;
}

protected abstract Object getStudy(T variant);

protected abstract Iterator<String> getStudiesId(T variant);

protected static String allelesToString(Collection<String> alleles) {
return alleles.stream()
.map(VariantContextConverter::alleleToString)
.collect(Collectors.joining(","));
}

private static String alleleToString(String a) {
return a.length() > 20 ? (a.substring(0, 10) + "...[" + a.length() + "]") : a;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,20 @@ public VariantAvroToVariantContextConverter(String study, List<String> sampleNam

@Override
public VariantContext convert(Variant variant) {
try {
return createContext(variant);
} catch (RuntimeException e) {
try {
logger.warn("Error creating VariantContext for variant " + variant);
logger.warn("JSON : " + variant.toJson());
} catch (RuntimeException e2) {
e.addSuppressed(e2);
}
throw e;
}
}

protected VariantContext createContext(Variant variant) {
init(variant);

StudyEntry studyEntry = getStudy(variant);
Expand All @@ -59,10 +73,32 @@ public VariantContext convert(Variant variant) {
.stream()
.map(entry -> entry.getCall() == null ? null : entry.getCall().getVariantId())
.iterator());
Pair<Integer, Integer> adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry, referenceAlleles);
Pair<Integer, Integer> adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry, referenceAlleles, Collections.emptySet());
int start = adjustedStartEndPositions.getLeft();
int end = adjustedStartEndPositions.getRight();
List<String> alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles);
Set<Integer> discardedAlleleIdx = getDiscardedAlleles(variant, alleleList);
if (!discardedAlleleIdx.isEmpty()) {
Set<String> duplicatedAlleles = getDuplicatedAlleles(alleleList);
List<String> otherDiscardedAlleles = new ArrayList<>(discardedAlleleIdx.size());
for (Integer alleleIdx: discardedAlleleIdx) {
String allele = alleleList.get(alleleIdx);
if (!duplicatedAlleles.contains(allele)) {
otherDiscardedAlleles.add(allele);
}
}
// If there are duplicated alleles, we need to re-adjust the start/end positions and the alleles
adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry, referenceAlleles, discardedAlleleIdx);
start = adjustedStartEndPositions.getLeft();
end = adjustedStartEndPositions.getRight();
alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles);

logger.warn("Discard alleles from variant " + chromosome + ":" + start + "-" + end + " , "
+ "Alleles : " + allelesToString(alleleList) + " , "
+ "discarded allele indexes : " + discardedAlleleIdx + ", "
+ "duplicated alleles: " + allelesToString(duplicatedAlleles) + ", "
+ "other discarded alleles: " + allelesToString(otherDiscardedAlleles));
}
boolean isNoVariation = type.equals(VariantType.NO_VARIATION);

// ID
Expand Down Expand Up @@ -102,31 +138,39 @@ public VariantContext convert(Variant variant) {
}

// SAMPLES
List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData);

return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes);
List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData, discardedAlleleIdx);
return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, discardedAlleleIdx);
}

/**
* Adjust start/end if a reference base is required due to an empty allele. All variants are checked due to SecAlts.
* @param variant {@link Variant} object.
* @param study Study
*
* @param variant {@link Variant} object.
* @param study Study
* @param discardedAlleles Set of alleles that are going to be discarded
* @return Pair<Integer, Integer> The adjusted (or same) start/end position e.g. SV and MNV as SecAlt, INDEL, etc.
*/
public static Pair<Integer, Integer> adjustedVariantStart(Variant variant, StudyEntry study, Map<Integer, Character> referenceAlleles) {
public static Pair<Integer, Integer> adjustedVariantStart(Variant variant, StudyEntry study, Map<Integer, Character> referenceAlleles,
Set<Integer> discardedAlleles) {
if (variant.getType().equals(VariantType.NO_VARIATION)) {
return new ImmutablePair<>(variant.getStart(), variant.getEnd());
}
MutablePair<Integer, Integer> pos = adjustedVariantStart(variant.getStart(), variant.getEnd(), variant.getReference(), variant.getAlternate(), referenceAlleles, null);

int alleleIdx = 2;
for (AlternateCoordinate alternateCoordinate : study.getSecondaryAlternates()) {
int alternateStart = alternateCoordinate.getStart() == null ? variant.getStart() : alternateCoordinate.getStart().intValue();
int alternateEnd = alternateCoordinate.getEnd() == null ? variant.getEnd() : alternateCoordinate.getEnd().intValue();
if (discardedAlleles.contains(alleleIdx)) {
// Do not adjust start/end based on discarded alleles
} else {
int alternateStart = alternateCoordinate.getStart() == null ? variant.getStart() : alternateCoordinate.getStart().intValue();
int alternateEnd = alternateCoordinate.getEnd() == null ? variant.getEnd() : alternateCoordinate.getEnd().intValue();

String reference = alternateCoordinate.getReference() == null ? variant.getReference() : alternateCoordinate.getReference();
String alternate = alternateCoordinate.getAlternate() == null ? variant.getAlternate() : alternateCoordinate.getAlternate();
String reference = alternateCoordinate.getReference() == null ? variant.getReference() : alternateCoordinate.getReference();
String alternate = alternateCoordinate.getAlternate() == null ? variant.getAlternate() : alternateCoordinate.getAlternate();

adjustedVariantStart(alternateStart, alternateEnd, reference, alternate, referenceAlleles, pos);
adjustedVariantStart(alternateStart, alternateEnd, reference, alternate, referenceAlleles, pos);
}
alleleIdx++;
}
return pos;
}
Expand Down Expand Up @@ -163,31 +207,40 @@ public List<String> buildAlleles(Variant variant, Pair<Integer, Integer> adjuste
return alleles;
}

/*
// this function was moved to the parent class: VariantContextConverter
public String buildAllele(String chromosome, Integer start, Integer end, String allele, Pair<Integer, Integer> adjustedRange) {
if (start.equals(adjustedRange.getLeft()) && end.equals(adjustedRange.getRight())) {
return allele; // same start / end
protected Set<Integer> getDiscardedAlleles(Variant variant, List<String> alleleList) {
if (alleleList.size() <= 2) {
return Collections.emptySet();
}
if (StringUtils.startsWith(allele, "*")) {
return allele; // no need
}
return getReferenceBase(chromosome, adjustedRange.getLeft(), start) + allele
+ getReferenceBase(chromosome, end, adjustedRange.getRight());
}
*/

/*
// this function was moved to the parent class: VariantContextConverter
private String getReferenceBase(String chromosome, Integer from, Integer to) {
int length = to - from;
if (length < 0) {
throw new IllegalStateException(
"Sequence length is negative: chromosome " + chromosome + " from " + from + " to " + to);

String mainAlternate = alleleList.get(1);
Set<String> allelesSet = new HashSet<>();
allelesSet.add(alleleList.get(0));
allelesSet.add(mainAlternate);

boolean symbolicMainAlt = isSymbolic(mainAlternate);
Set<Integer> discardedAlleles = new HashSet<>();
for (int i = 2; i < alleleList.size(); i++) {
String allele = alleleList.get(i);
if (!allelesSet.add(allele)) {
// Remove duplicated
discardedAlleles.add(i);
} else if (symbolicMainAlt != isSymbolic(allele) && !isNonRef(allele)) {
// If the main alternate is symbolic, all other alleles must be symbolic
// If the main alternate is not symbolic, all other alleles must not be symbolic
// Do not discard <NON_REF> or <*>
discardedAlleles.add(i);
} else if (symbolicMainAlt && isSymbolic(allele)) {
// If the main alternate is symbolic, and the allele is symbolic, check if they have the same coordinates
AlternateCoordinate alternateCoordinate = variant.getStudies().get(0).getSecondaryAlternates().get(i - 2);
if ((alternateCoordinate.getStart() != null && !alternateCoordinate.getStart().equals(variant.getStart()))
|| (alternateCoordinate.getEnd() != null && !alternateCoordinate.getEnd().equals(variant.getEnd()))) {
discardedAlleles.add(i);
}
}
}
return StringUtils.repeat('N', length); // current return default base TODO load reference sequence

return discardedAlleles;
}
*/

private void addCohortStatsMultiInfoField(StudyEntry studyEntry, Map<String, Object> attributes) {
if (studyEntry.getStats() == null || studyEntry.getStats().size() == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
/**
* Created by jtarraga on 07/02/17.
*/
public class VariantProtoToVariantContextConverter extends VariantContextConverter<VariantProto.Variant> {
@Deprecated
public abstract class VariantProtoToVariantContextConverter extends VariantContextConverter<VariantProto.Variant> {

private final Logger logger = LoggerFactory.getLogger(VariantProtoToVariantContextConverter.class);

Expand Down Expand Up @@ -60,6 +61,7 @@ public VariantContext convert(VariantProto.Variant variant) {
int start = adjustedStartEndPositions.getLeft();
int end = adjustedStartEndPositions.getRight();
List<String> alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles);
Set<String> duplicatedAlleles = getDuplicatedAlleles(alleleList);
boolean isNoVariation = type.equals(VariantProto.VariantType.NO_VARIATION);

// ID
Expand Down Expand Up @@ -96,9 +98,9 @@ public VariantContext convert(VariantProto.Variant variant) {
// SAMPLES
BiFunction<String, String, String> getSampleData = (sampleName, id) -> getSampleData(studyEntry, formatPositions, sampleName, id);

List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData);
List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData, null);

return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes);
return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, null);
}

public String getSampleData(VariantProto.StudyEntry studyEntry, Map<String, Integer> formatPositions, String sampleName, String field) {
Expand Down
Loading

0 comments on commit 5c2b5fc

Please sign in to comment.