Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TASK-4682 - Error in VariantExport VCF : Duplicate allele (pre-step to exomiser) #282

Merged
merged 5 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.tools.commons.Converter;
import org.opencb.commons.datastore.core.ObjectMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.text.DecimalFormat;
import java.util.*;
Expand All @@ -38,6 +40,7 @@
* Created by jtarraga on 07/02/17.
*/
public abstract class VariantContextConverter<T> implements Converter<T, VariantContext> {
private final Logger logger = LoggerFactory.getLogger(VariantContextConverter.class);

public static final DecimalFormat DECIMAL_FORMAT_7 = new DecimalFormat("#.#######");
public static final DecimalFormat DECIMAL_FORMAT_3 = new DecimalFormat("#.###");
Expand Down Expand Up @@ -148,8 +151,7 @@ protected static MutablePair<Integer, Integer> adjustedVariantStart(
protected static String getReferenceBase(String chromosome, int from, int to, Map<Integer, Character> referenceAlleles) {
int length = to - from;
if (length < 0) {
throw new IllegalStateException(
"Sequence length is negative: chromosome " + chromosome + " from " + from + " to " + to);
return "";
}
StringBuilder sb = new StringBuilder(length);
for (int i = from; i < to; i++) {
Expand Down Expand Up @@ -246,15 +248,15 @@ protected double getQuality(List<Map<String, String>> fileAttributes) {
}
} catch (NumberFormatException e) {
// Nothing to do
e.getMessage();
logger.warn("Invalid QUAL value found: " + attrs.get(StudyEntry.QUAL));
}
}
}
}
return (qual == Double.MAX_VALUE ? VariantContext.NO_LOG10_PERROR : (-0.1 * qual));
}

protected List<Genotype> getGenotypes(List<String> alleleList, List<String> sampleDataKeys, BiFunction<String, String, String> getSampleData) {
protected List<Genotype> getGenotypes(List<String> alleleList, List<String> sampleDataKeys, BiFunction<String, String, String> getSampleData, Set<Integer> discardedAlleles) {
String refAllele = alleleList.get(0);
Set<Integer> noCallAlleles = getNoCallAlleleIdx(alleleList);

Expand All @@ -265,7 +267,7 @@ protected List<Genotype> getGenotypes(List<String> alleleList, List<String> samp
sampleDataKeys = this.sampleFormats;
}

for (String sampleName : this.sampleNames) {
samplesLoop: for (String sampleName : this.sampleNames) {
GenotypeBuilder genotypeBuilder = new GenotypeBuilder().name(sampleName);
for (String key : sampleDataKeys) {
String value = getSampleData.apply(sampleName, key);
Expand All @@ -279,7 +281,20 @@ protected List<Genotype> getGenotypes(List<String> alleleList, List<String> samp
List<Allele> alleles = new ArrayList<>();
for (int gtIdx : genotype.getAllelesIdx()) {
if (gtIdx < alleleList.size() && gtIdx >= 0 && !noCallAlleles.contains(gtIdx)) { // .. AND NOT a nocall allele
alleles.add(Allele.create(alleleList.get(gtIdx), gtIdx == 0)); // allele is ref. if the alleleIndex is 0
String alternate = alleleList.get(gtIdx);
if (discardedAlleles.contains(gtIdx)) {
// If this genotype contains a duplicated allele, and it is not the first occurrence, skip it
logger.warn("Skipping allele '" + alleleToString(alternate) + "' for sample '" + sampleName + "'");
genotypes.add(new GenotypeBuilder().name(sampleName)
.alleles(Arrays.asList(
Allele.create(NO_CALL_ALLELE, false),
Allele.create(NO_CALL_ALLELE, false)
)).make()
);
// skip the rest of the sample data
continue samplesLoop;
}
alleles.add(Allele.create(alternate, gtIdx == 0)); // allele is ref. if the alleleIndex is 0
} else {
alleles.add(Allele.create(NO_CALL_ALLELE, false)); // genotype of a secondary alternate, or an actual missing
}
Expand Down Expand Up @@ -341,7 +356,9 @@ private int[] getInts(String value) {
return ints;
}

protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List<String> alleleList, boolean isNoVariation, Set<String> filters, double qual, ObjectMap attributes, List<Genotype> genotypes) {
protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List<String> alleleList,
boolean isNoVariation, Set<String> filters, double qual, ObjectMap attributes,
List<Genotype> genotypes, Set<Integer> discardedAlleles) {
String refAllele = alleleList.get(0);
VariantContextBuilder variantContextBuilder = new VariantContextBuilder()
.chr(chromosome)
Expand All @@ -355,7 +372,15 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en
if (isNoVariation && alleleList.get(1).isEmpty()) {
variantContextBuilder.alleles(refAllele);
} else {
variantContextBuilder.alleles(alleleList.stream().filter(a -> !a.equals(NO_CALL_ALLELE)).collect(Collectors.toList()));
List<String> finalAlleles = new ArrayList<>(alleleList);
for (Integer i : discardedAlleles) {
finalAlleles.set(i, null);
}
finalAlleles = finalAlleles.stream()
.filter(Objects::nonNull)
.filter(a -> !a.equals(NO_CALL_ALLELE))
.collect(Collectors.toList());
variantContextBuilder.alleles(finalAlleles);
}

if (genotypes.isEmpty()) {
Expand All @@ -364,15 +389,63 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en
variantContextBuilder.genotypes(genotypes);
}


if (isSymbolic(alleleList.get(1))) {
attributes.append(VCFConstants.END_KEY, end);
}
variantContextBuilder.attributes(attributes);

variantContextBuilder.id(idForVcf);

return variantContextBuilder.make();
try {
return variantContextBuilder.make();
} catch (RuntimeException e) {
throw new IllegalArgumentException(
"Error creating VariantContext: " + chromosome + ":" + start + "-" + end + ":" + alleleList, e);
}
}

/**
* Check if the allele is a symbolic allele other than <NON_REF> or <*>
* @param allele Allele
* @return True if the allele is a symbolic allele
*/
protected static boolean isSymbolic(String allele) {
return allele.startsWith("<") && allele.endsWith(">") && !isNonRef(allele);
}

protected static boolean isNonRef(String allele) {
return allele.equals(Allele.NON_REF_STRING) || allele.equals(Allele.UNSPECIFIED_ALTERNATE_ALLELE_STRING);
}

protected Set<String> getDuplicatedAlleles(List<String> alleleList) {
Set<String> duplicatedAlleles;
if (alleleList.size() > 2 && new HashSet<>(alleleList).size() != alleleList.size()) {
Set<String> allelesSet = new HashSet<>();

duplicatedAlleles = new HashSet<>();
for (String allele : alleleList) {
if (!allelesSet.add(allele)) {
duplicatedAlleles.add(allele);
}
}
} else {
duplicatedAlleles = Collections.emptySet();
}
return duplicatedAlleles;
}

protected abstract Object getStudy(T variant);

protected abstract Iterator<String> getStudiesId(T variant);

protected static String allelesToString(Collection<String> alleles) {
return alleles.stream()
.map(VariantContextConverter::alleleToString)
.collect(Collectors.joining(","));
}

private static String alleleToString(String a) {
return a.length() > 20 ? (a.substring(0, 10) + "...[" + a.length() + "]") : a;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,20 @@ public VariantAvroToVariantContextConverter(String study, List<String> sampleNam

@Override
public VariantContext convert(Variant variant) {
try {
return createContext(variant);
} catch (RuntimeException e) {
try {
logger.warn("Error creating VariantContext for variant " + variant);
logger.warn("JSON : " + variant.toJson());
} catch (RuntimeException e2) {
e.addSuppressed(e2);
}
throw e;
}
}

protected VariantContext createContext(Variant variant) {
init(variant);

StudyEntry studyEntry = getStudy(variant);
Expand All @@ -59,10 +73,32 @@ public VariantContext convert(Variant variant) {
.stream()
.map(entry -> entry.getCall() == null ? null : entry.getCall().getVariantId())
.iterator());
Pair<Integer, Integer> adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry, referenceAlleles);
Pair<Integer, Integer> adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry, referenceAlleles, Collections.emptySet());
int start = adjustedStartEndPositions.getLeft();
int end = adjustedStartEndPositions.getRight();
List<String> alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles);
Set<Integer> discardedAlleleIdx = getDiscardedAlleles(variant, alleleList);
if (!discardedAlleleIdx.isEmpty()) {
Set<String> duplicatedAlleles = getDuplicatedAlleles(alleleList);
List<String> otherDiscardedAlleles = new ArrayList<>(discardedAlleleIdx.size());
for (Integer alleleIdx: discardedAlleleIdx) {
String allele = alleleList.get(alleleIdx);
if (!duplicatedAlleles.contains(allele)) {
otherDiscardedAlleles.add(allele);
}
}
// If there are duplicated alleles, we need to re-adjust the start/end positions and the alleles
adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry, referenceAlleles, discardedAlleleIdx);
start = adjustedStartEndPositions.getLeft();
end = adjustedStartEndPositions.getRight();
alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles);

logger.warn("Discard alleles from variant " + chromosome + ":" + start + "-" + end + " , "
+ "Alleles : " + allelesToString(alleleList) + " , "
+ "discarded allele indexes : " + discardedAlleleIdx + ", "
+ "duplicated alleles: " + allelesToString(duplicatedAlleles) + ", "
+ "other discarded alleles: " + allelesToString(otherDiscardedAlleles));
}
boolean isNoVariation = type.equals(VariantType.NO_VARIATION);

// ID
Expand Down Expand Up @@ -102,31 +138,39 @@ public VariantContext convert(Variant variant) {
}

// SAMPLES
List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData);

return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes);
List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData, discardedAlleleIdx);
return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, discardedAlleleIdx);
}

/**
* Adjust start/end if a reference base is required due to an empty allele. All variants are checked due to SecAlts.
* @param variant {@link Variant} object.
* @param study Study
*
* @param variant {@link Variant} object.
* @param study Study
* @param discardedAlleles Set of alleles that are going to be discarded
* @return Pair<Integer, Integer> The adjusted (or same) start/end position e.g. SV and MNV as SecAlt, INDEL, etc.
*/
public static Pair<Integer, Integer> adjustedVariantStart(Variant variant, StudyEntry study, Map<Integer, Character> referenceAlleles) {
public static Pair<Integer, Integer> adjustedVariantStart(Variant variant, StudyEntry study, Map<Integer, Character> referenceAlleles,
Set<Integer> discardedAlleles) {
if (variant.getType().equals(VariantType.NO_VARIATION)) {
return new ImmutablePair<>(variant.getStart(), variant.getEnd());
}
MutablePair<Integer, Integer> pos = adjustedVariantStart(variant.getStart(), variant.getEnd(), variant.getReference(), variant.getAlternate(), referenceAlleles, null);

int alleleIdx = 2;
for (AlternateCoordinate alternateCoordinate : study.getSecondaryAlternates()) {
int alternateStart = alternateCoordinate.getStart() == null ? variant.getStart() : alternateCoordinate.getStart().intValue();
int alternateEnd = alternateCoordinate.getEnd() == null ? variant.getEnd() : alternateCoordinate.getEnd().intValue();
if (discardedAlleles.contains(alleleIdx)) {
// Do not adjust start/end based on discarded alleles
} else {
int alternateStart = alternateCoordinate.getStart() == null ? variant.getStart() : alternateCoordinate.getStart().intValue();
int alternateEnd = alternateCoordinate.getEnd() == null ? variant.getEnd() : alternateCoordinate.getEnd().intValue();

String reference = alternateCoordinate.getReference() == null ? variant.getReference() : alternateCoordinate.getReference();
String alternate = alternateCoordinate.getAlternate() == null ? variant.getAlternate() : alternateCoordinate.getAlternate();
String reference = alternateCoordinate.getReference() == null ? variant.getReference() : alternateCoordinate.getReference();
String alternate = alternateCoordinate.getAlternate() == null ? variant.getAlternate() : alternateCoordinate.getAlternate();

adjustedVariantStart(alternateStart, alternateEnd, reference, alternate, referenceAlleles, pos);
adjustedVariantStart(alternateStart, alternateEnd, reference, alternate, referenceAlleles, pos);
}
alleleIdx++;
}
return pos;
}
Expand Down Expand Up @@ -163,31 +207,40 @@ public List<String> buildAlleles(Variant variant, Pair<Integer, Integer> adjuste
return alleles;
}

/*
// this function was moved to the parent class: VariantContextConverter
public String buildAllele(String chromosome, Integer start, Integer end, String allele, Pair<Integer, Integer> adjustedRange) {
if (start.equals(adjustedRange.getLeft()) && end.equals(adjustedRange.getRight())) {
return allele; // same start / end
protected Set<Integer> getDiscardedAlleles(Variant variant, List<String> alleleList) {
if (alleleList.size() <= 2) {
return Collections.emptySet();
}
if (StringUtils.startsWith(allele, "*")) {
return allele; // no need
}
return getReferenceBase(chromosome, adjustedRange.getLeft(), start) + allele
+ getReferenceBase(chromosome, end, adjustedRange.getRight());
}
*/

/*
// this function was moved to the parent class: VariantContextConverter
private String getReferenceBase(String chromosome, Integer from, Integer to) {
int length = to - from;
if (length < 0) {
throw new IllegalStateException(
"Sequence length is negative: chromosome " + chromosome + " from " + from + " to " + to);

String mainAlternate = alleleList.get(1);
Set<String> allelesSet = new HashSet<>();
allelesSet.add(alleleList.get(0));
allelesSet.add(mainAlternate);

boolean symbolicMainAlt = isSymbolic(mainAlternate);
Set<Integer> discardedAlleles = new HashSet<>();
for (int i = 2; i < alleleList.size(); i++) {
String allele = alleleList.get(i);
if (!allelesSet.add(allele)) {
// Remove duplicated
discardedAlleles.add(i);
} else if (symbolicMainAlt != isSymbolic(allele) && !isNonRef(allele)) {
// If the main alternate is symbolic, all other alleles must be symbolic
// If the main alternate is not symbolic, all other alleles must not be symbolic
// Do not discard <NON_REF> or <*>
discardedAlleles.add(i);
} else if (symbolicMainAlt && isSymbolic(allele)) {
// If the main alternate is symbolic, and the allele is symbolic, check if they have the same coordinates
AlternateCoordinate alternateCoordinate = variant.getStudies().get(0).getSecondaryAlternates().get(i - 2);
if ((alternateCoordinate.getStart() != null && !alternateCoordinate.getStart().equals(variant.getStart()))
|| (alternateCoordinate.getEnd() != null && !alternateCoordinate.getEnd().equals(variant.getEnd()))) {
discardedAlleles.add(i);
}
}
}
return StringUtils.repeat('N', length); // current return default base TODO load reference sequence

return discardedAlleles;
}
*/

private void addCohortStatsMultiInfoField(StudyEntry studyEntry, Map<String, Object> attributes) {
if (studyEntry.getStats() == null || studyEntry.getStats().size() == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
/**
* Created by jtarraga on 07/02/17.
*/
public class VariantProtoToVariantContextConverter extends VariantContextConverter<VariantProto.Variant> {
@Deprecated
public abstract class VariantProtoToVariantContextConverter extends VariantContextConverter<VariantProto.Variant> {

private final Logger logger = LoggerFactory.getLogger(VariantProtoToVariantContextConverter.class);

Expand Down Expand Up @@ -60,6 +61,7 @@ public VariantContext convert(VariantProto.Variant variant) {
int start = adjustedStartEndPositions.getLeft();
int end = adjustedStartEndPositions.getRight();
List<String> alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles);
Set<String> duplicatedAlleles = getDuplicatedAlleles(alleleList);
boolean isNoVariation = type.equals(VariantProto.VariantType.NO_VARIATION);

// ID
Expand Down Expand Up @@ -96,9 +98,9 @@ public VariantContext convert(VariantProto.Variant variant) {
// SAMPLES
BiFunction<String, String, String> getSampleData = (sampleName, id) -> getSampleData(studyEntry, formatPositions, sampleName, id);

List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData);
List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData, null);

return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes);
return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, null);
}

public String getSampleData(VariantProto.StudyEntry studyEntry, Map<String, Integer> formatPositions, String sampleName, String field) {
Expand Down
Loading
Loading