diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 000000000..5008ddfcf
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
index 3aef1432f..f98d6fb6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,4 +20,10 @@ nbactions*.xml
nb-configuration.xml
-eqtl-mapping-pipeline/nb-configuration.xml
\ No newline at end of file
+eqtl-mapping-pipeline/nb-configuration.xml
+/eQTLInteractionAnalyser/nbproject/private/
+/eQTLInteractionAnalyser/build/
+/eQTLInteractionAnalyser/dist/
+/eQTLInteractionAnalyser2/build/
+/eQTLInteractionAnalyser2/dist/
+/eQTLInteractionAnalyser2/nbproject/private/
\ No newline at end of file
diff --git a/BinaryMetaAnalyzer/pom.xml b/BinaryMetaAnalyzer/pom.xml
index 09d582d05..d9314c876 100644
--- a/BinaryMetaAnalyzer/pom.xml
+++ b/BinaryMetaAnalyzer/pom.xml
@@ -7,13 +7,13 @@
1.0.2-SNAPSHOTBinaryMetaAnalyzer
- 1.0.4-SNAPSHOT
+ 1.0.7-SNAPSHOTjar${project.groupId}genetica-libraries
- 1.0.5
+ 1.0.7-SNAPSHOTBinaryMetaAnalyzer
diff --git a/BinaryMetaAnalyzer/src/main/java/nl/umcg/bondermj/microbialmetanalysis/BinaryMicrobePcaAnalysis.java b/BinaryMetaAnalyzer/src/main/java/nl/umcg/bondermj/microbialmetanalysis/BinaryMicrobePcaAnalysis.java
index c75dbcec7..61f7602dc 100644
--- a/BinaryMetaAnalyzer/src/main/java/nl/umcg/bondermj/microbialmetanalysis/BinaryMicrobePcaAnalysis.java
+++ b/BinaryMetaAnalyzer/src/main/java/nl/umcg/bondermj/microbialmetanalysis/BinaryMicrobePcaAnalysis.java
@@ -105,9 +105,7 @@ public void run(int bufferSize) throws IOException {
loadProbeAnnotation();
for (int permutation = 0; permutation < settings.getNrPermutations() + 1; permutation++) {
- finalEQTLs = new QTL[bufferSize];
- locationToStoreResult = 0;
- bufferHasOverFlown = false;
+ clearResultsBuffer();
maxSavedPvalue = -Double.MAX_VALUE;
// create dataset objects
System.out.println("Running permutation " + permutation);
@@ -115,7 +113,7 @@ public void run(int bufferSize) throws IOException {
System.out.println("Loading datasets");
for (int d = 0; d < datasets.length; d++) {
- datasets[d] = new BinaryMetaAnalysisDataset(settings.getDatasetlocations().get(d), settings.getDatasetPrefix().get(d), permutation, settings.getDatasetannotations().get(d), probeAnnotation);
+ datasets[d] = new BinaryMetaAnalysisDataset(settings.getDatasetlocations().get(d), settings.getDatasetnames().get(d), settings.getDatasetPrefix().get(d), permutation, settings.getDatasetannotations().get(d), probeAnnotation);
}
System.out.println("Loaded " + datasets.length + " datasets");
@@ -224,7 +222,7 @@ public void run(int bufferSize) throws IOException {
for (int probe = 0; probe < traitList.length; probe++) {
double metaAnalysisZ = ZScores.getWeightedZ(finalZScores[probe], sampleSizes);
double tScore = ZScores.zScoreToCorrelation(metaAnalysisZ, totalSampleSize);
- summedRsquare += tScore*tScore;
+ summedRsquare += tScore * tScore;
}
double newMetaZ = Correlation.convertCorrelationToZScore(totalSampleSize, Math.sqrt(summedRsquare));
double newMetaAnalysisP = Descriptives.convertZscoreToPvalue(newMetaZ);
@@ -244,10 +242,10 @@ public void run(int bufferSize) throws IOException {
double metaAnalysisZ = ZScores.getWeightedZ(finalZScores[probe], sampleSizes);
for (int i = 0; i < finalZScores[probe].length; i++) {
double tScore = ZScores.zScoreToCorrelation(finalZScores[probe][i], sampleSizes[i]);
- summedPerDataSet[i] += tScore*tScore;
+ summedPerDataSet[i] += tScore * tScore;
}
double tScore = ZScores.zScoreToCorrelation(metaAnalysisZ, totalSampleSize);
- summedRsquare += tScore*tScore;
+ summedRsquare += tScore * tScore;
}
for (int i = 0; i < summedPerDataSet.length; i++) {
@@ -261,7 +259,7 @@ public void run(int bufferSize) throws IOException {
MetaQTL4MetaTrait t = new MetaQTL4MetaTrait(21, "Microbe_Components", "-", -1, -1, "", traitList[0].getPlatformIds());
QTL q = new QTL(newMetaAnalysisP, t, snp, BaseAnnot.toByte(alleleAssessed), newMetaZ, BaseAnnot.toByteArray(alleles), summedPerDataSet, sampleSizes); // sort buffer if needed.
addEQTL(q);
- } else {
+ } else {
System.out.println("Error in procedure.");
}
}
@@ -546,4 +544,11 @@ private void writeBuffer(String outdir, int permutation) throws IOException {
System.out.println(
"Done.");
}
+
+ private void clearResultsBuffer() {
+ Arrays.fill(finalEQTLs, null);
+ bufferHasOverFlown = false;
+ locationToStoreResult = 0;
+ maxSavedPvalue = -Double.MAX_VALUE;
+ }
}
diff --git a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysis.java b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysis.java
index 03c109bcf..2c9167723 100644
--- a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysis.java
+++ b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysis.java
@@ -46,7 +46,7 @@ public static void main(String[] args) {
settingsFile = args[0];
} else {
- System.out.println("Usage: settings.xml replacetext replacetextwith");
+ System.out.println("Usage of the binary meta-analysis: settings.xml replacetext replacetextwith");
System.exit(-1);
}
@@ -54,7 +54,7 @@ public static void main(String[] args) {
System.exit(0);
}
-
+
private MetaQTL4TraitAnnotation probeAnnotation;
private BinaryMetaAnalysisDataset[] datasets = new BinaryMetaAnalysisDataset[0];
private int[][] snpIndex;
@@ -105,24 +105,36 @@ public void run() throws IOException {
System.out.println("Loading probe annotation from: " + settings.getProbetranslationfile());
loadProbeAnnotation();
+ if (traitList.length == 0) {
+ System.err.println("Error: no annotation loaded.");
+ System.exit(-1);
+ }
+
for (int permutation = 0; permutation < settings.getNrPermutations() + 1; permutation++) {
+ clearResultsBuffer();
+
// create dataset objects
System.out.println("Running permutation " + permutation);
datasets = new BinaryMetaAnalysisDataset[settings.getDatasetlocations().size()];
System.out.println("Loading datasets");
for (int d = 0; d < datasets.length; d++) {
- datasets[d] = new BinaryMetaAnalysisDataset(settings.getDatasetlocations().get(d), settings.getDatasetPrefix().get(d), permutation, settings.getDatasetannotations().get(d), probeAnnotation);
+ datasets[d] = new BinaryMetaAnalysisDataset(settings.getDatasetlocations().get(d),
+ settings.getDatasetnames().get(d),
+ settings.getDatasetPrefix().get(d),
+ permutation,
+ settings.getDatasetannotations().get(d),
+ probeAnnotation);
}
System.out.println("Loaded " + datasets.length + " datasets");
// create meta-analysis SNP index. have to recreate this every permutation,
// since the order of SNPs is generated at random.
System.out.println("Creating SNP index");
- createSNPIndex();
+ createSNPIndex(outdir);
System.out.println("Total of " + snpIndex.length + " SNPs");
System.out.println("Creating probe index");
- createProbeIndex();
+ createProbeIndex(outdir);
System.out.println("Total of " + probeIndex.length + " probes");
if (snpChr == null) {
@@ -245,9 +257,10 @@ public void run() throws IOException {
double metaZ = ZScores.getWeightedZ(finalZScores[probe], sampleSizes);
double p = Descriptives.convertZscoreToPvalue(metaZ);
- if (!Double.isNaN(p)) {
+ if (!Double.isNaN(p) && !Double.isNaN(metaZ)) {
// create output object
QTL q = new QTL(p, t, snp, BaseAnnot.toByte(alleleAssessed), metaZ, BaseAnnot.toByteArray(alleles), finalZScores[probe], sampleSizes); // sort buffer if needed.
+// System.out.println(q.getSNPId()+"\t"+q.getMetaTrait().getMetaTraitName()+"\t"+q.toString());
addEQTL(q);
} else {
// if (!printed) {
@@ -307,8 +320,9 @@ public void run() throws IOException {
double metaAnalysisP = Descriptives.convertZscoreToPvalue(metaAnalysisZ);
// create output object
- if (!Double.isNaN(metaAnalysisP)) {
+ if (!Double.isNaN(metaAnalysisP) && !Double.isNaN(metaAnalysisZ)) {
QTL q = new QTL(metaAnalysisP, t, snp, BaseAnnot.toByte(alleleAssessed), metaAnalysisZ, BaseAnnot.toByteArray(alleles), finalZScores[probe], sampleSizes); // sort buffer if needed.
+// System.out.println(q.getSNPId()+"\t"+q.getMetaTrait().getMetaTraitName()+"\t"+q.toString());
addEQTL(q);
}
}
@@ -332,7 +346,7 @@ public void run() throws IOException {
*/
}
- private void createSNPIndex() throws IOException {
+ private void createSNPIndex(String outdir) throws IOException {
HashSet confineToTheseSNPs = null;
if (settings.getSNPSelection() != null) {
@@ -384,6 +398,21 @@ private void createSNPIndex() throws IOException {
}
}
}
+
+ TextFile tf = new TextFile(outdir + "snpindex.txt", TextFile.W);
+ String header = "metaID";
+ for (int d = 0; d < datasets.length; d++) {
+ header += "\t" + datasets[d].getName() + "-sid";
+ }
+ tf.writeln(header);
+ for (int s = 0; s < snpList.length; s++) {
+ String ln = snpList[s];
+ for (int d = 0; d < datasets.length; d++) {
+ ln += "\t" + snpIndex[s][d];
+ }
+ tf.writeln(ln);
+ }
+ tf.close();
}
private void loadProbeAnnotation() throws IOException {
@@ -432,31 +461,73 @@ private void loadSNPAnnotation() throws IOException {
}
// index the probes
- private void createProbeIndex() throws IOException {
-
+ private void createProbeIndex(String outdir) throws IOException {
+
HashSet confineToTheseProbes = null;
- if (settings.getProbeselection()!= null) {
+ if (settings.getProbeselection() != null) {
System.out.println("Selecting Probes from file: " + settings.getProbeselection());
confineToTheseProbes = new HashSet();
TextFile tf = new TextFile(settings.getProbeselection(), TextFile.R);
confineToTheseProbes.addAll(tf.readAsArrayList());
tf.close();
-
System.out.println(confineToTheseProbes.size() + " Probes loaded.");
}
-
+
+ System.out.println("");
probeIndex = new Integer[traitList.length][datasets.length];
+
for (int d = 0; d < datasets.length; d++) {
String[] probes = datasets[d].getProbeList();
- int platformId = probeAnnotation.getPlatformId(settings.getDatasetannotations().get(d));
+ int platformId = probeAnnotation.getPlatformId(datasets[d].getPlatform());
+
+ HashMap traitHashForPlatform = probeAnnotation.getTraitHashForPlatform(platformId);
+ System.out.println(probeAnnotation.getTraitHashPerPlatform().size());
+
+ System.out.println(datasets[d].getName() + "\t" + platformId + "\t" + datasets[d].getPlatform() + "\t" + traitHashForPlatform.size());
for (int p = 0; p < probes.length; p++) {
+
+ MetaQTL4MetaTrait t = traitHashForPlatform.get(probes[p]);
+ int index = traitMap.get(t);
+
+ if (probes[p].equals("60437")) {
+ if (t != null) {
+ System.out.println(t.getMetaTraitId());
+ } else {
+ System.out.println("not found");
+ }
+ }
+
if (confineToTheseProbes == null || confineToTheseProbes.contains(probes[p])) {
- MetaQTL4MetaTrait t = probeAnnotation.getTraitForPlatformId(platformId, probes[p]);
- int index = traitMap.get(t);
probeIndex[index][d] = p;
}
}
}
+
+ System.out.println("");
+
+ TextFile out = new TextFile(outdir + "probeindex.txt", TextFile.W);
+
+ String header = "metaID";
+ for (int d = 0; d < datasets.length; d++) {
+ header += "\t" + datasets[d].getName() + "-pid\t" + datasets[d].getName() + "-probename";
+ }
+ out.writeln(header);
+ for (int p = 0; p < probeIndex.length; p++) {
+
+ String lnout = "" + traitList[p].getMetaTraitId();
+ for (int d = 0; d < datasets.length; d++) {
+ Integer pid = probeIndex[p][d];
+ String probeName = null;
+ if (pid != null) {
+ probeName = datasets[d].getProbeList()[pid];
+ }
+ lnout += "\t" + pid + "\t" + probeName;
+ }
+
+ out.writeln(lnout);
+ }
+
+ out.close();
}
private void addEQTL(QTL q) {
@@ -529,10 +600,10 @@ private void writeBuffer(String outdir, int permutation) throws IOException {
+ "Meta-Beta (SE)\t"
+ "Beta (SE)\t"
+ "FoldChange";
-
+
output.writeln(header);
// PValue SNPName SNPChr SNPChrPos ProbeName ProbeChr ProbeCenterChrPos CisTrans SNPType AlleleAssessed OverallZScore DatasetsWhereSNPProbePairIsAvailableAndPassesQC DatasetsZScores DatasetsNrSamples IncludedDatasetsMeanProbeExpression IncludedDatasetsProbeExpressionVariance HGNCName IncludedDatasetsCorrelationCoefficient Meta-Beta (SE) Beta (SE) FoldChange FDR
-
+
DecimalFormat format = new DecimalFormat("###.#######", new DecimalFormatSymbols(Locale.US));
DecimalFormat smallFormat = new DecimalFormat("0.#####E0", new DecimalFormatSymbols(Locale.US));
for (int i = 0; i < settings.getFinalEQTLBufferMaxLength(); i++) {
@@ -580,13 +651,20 @@ private void writeBuffer(String outdir, int permutation) throws IOException {
float[] datasetZScores = q.getDatasetZScores();
String[] dsBuilder = new String[datasets.length];
String[] dsNBuilder = new String[datasets.length];
+ String[] dsZBuilder = new String[datasets.length];
+
for (int d = 0; d < datasetZScores.length; d++) {
+
if (!Float.isNaN(datasetZScores[d])) {
+ String str = format.format(datasetZScores[d]);
+
dsBuilder[d] = settings.getDatasetnames().get(d);
dsNBuilder[d] = "" + q.getDatasetSampleSizes()[d];
+ dsZBuilder[d] = str;
} else {
dsBuilder[d] = "-";
dsNBuilder[d] = "-";
+ dsZBuilder[d] = "-";
}
}
@@ -594,7 +672,7 @@ private void writeBuffer(String outdir, int permutation) throws IOException {
sb.append(Strings.concat(dsBuilder, Strings.semicolon));
sb.append("\t");
- sb.append(Strings.concat(datasetZScores, format, Strings.semicolon));
+ sb.append(Strings.concat(dsZBuilder, Strings.semicolon));
sb.append("\t");
sb.append(Strings.concat(dsNBuilder, Strings.semicolon));
@@ -612,4 +690,11 @@ private void writeBuffer(String outdir, int permutation) throws IOException {
System.out.println(
"Done.");
}
+
+ private void clearResultsBuffer() {
+ Arrays.fill(finalEQTLs, null);
+ bufferHasOverFlown = false;
+ locationToStoreResult = 0;
+ maxSavedPvalue = -Double.MAX_VALUE;
+ }
}
diff --git a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysisDataset.java b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysisDataset.java
index 7b9657a3b..4af125e99 100644
--- a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysisDataset.java
+++ b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysisDataset.java
@@ -38,11 +38,16 @@ public class BinaryMetaAnalysisDataset {
private final int platformId;
private RandomAccessFile raf;
- public BinaryMetaAnalysisDataset(String dir, String prefix, int permutation, String platform, MetaQTL4TraitAnnotation probeAnnotation) throws IOException {
+ private String name = null;
+ private String platform = null;
+
+ public BinaryMetaAnalysisDataset(String dir, String name, String prefix, int permutation, String platform, MetaQTL4TraitAnnotation probeAnnotation) throws IOException {
dir = Gpio.formatAsDirectory(dir);
String matrix = dir;
String probeFile = dir;
String snpFile = dir;
+ this.platform = platform;
+ this.name = name;
this.probeAnnotation = probeAnnotation;
this.platformId = probeAnnotation.getPlatformId(platform);
String pref = "Dataset";
@@ -265,4 +270,12 @@ public void close() throws IOException {
raf.close();
}
+ public String getName() {
+ return name;
+ }
+
+ public String getPlatform() {
+ return platform;
+ }
+
}
diff --git a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/MetaQTL4TraitAnnotation.java b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/MetaQTL4TraitAnnotation.java
index 8737b2175..e00e376da 100644
--- a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/MetaQTL4TraitAnnotation.java
+++ b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/MetaQTL4TraitAnnotation.java
@@ -67,8 +67,9 @@ public MetaQTL4TraitAnnotation(File probeAnnotationFile, Set platformsTo
}
}
-
int probeCounter = 0;
+
+ // parse lines
for (String[] elems : tf.readLineElemsIterable(TextFile.tab)) {
String metaTraitName = elems[0];
@@ -89,12 +90,14 @@ public MetaQTL4TraitAnnotation(File probeAnnotationFile, Set platformsTo
}
String hugo = elems[4];
+
String[] platformIds = new String[nrPlatforms];
// int metaTraitId, String metaTraitName, String chr, int chrStart, int chrEnd, String annotation, String[] platformIds
+
MetaQTL4MetaTrait metaTraitObj = new MetaQTL4MetaTrait(probeCounter, metaTraitName, chr, chrstartpos, chrendpos, hugo, platformIds);
+ platformNr = 0;
for (int i = 5; i < elems.length; i++) {
- platformNr = 0;
if (colsToInclude[i]) {
platformIds[platformNr] = elems[i];
HashMap probeToId = traitHashPerPlatform.get(platformNr);
@@ -102,6 +105,7 @@ public MetaQTL4TraitAnnotation(File probeAnnotationFile, Set platformsTo
platformNr++;
}
}
+
probeCounter++;
metatraits.add(metaTraitObj);
metaTraitNameToObj.put(metaTraitName, metaTraitObj);
@@ -118,6 +122,10 @@ public MetaQTL4MetaTrait getTraitForPlatformId(Integer platformId, String platfo
return traitHashPerPlatform.get(platformId).get(platformTrait);
}
+ public HashMap getTraitHashForPlatform(Integer platformId) {
+ return traitHashPerPlatform.get(platformId);
+ }
+
public String[] getPlatforms() {
return platforms;
}
diff --git a/Genotype-Harmonizer/nb-configuration.xml b/Genotype-Harmonizer/nb-configuration.xml
index 5f8c56a87..4c7dcce1c 100644
--- a/Genotype-Harmonizer/nb-configuration.xml
+++ b/Genotype-Harmonizer/nb-configuration.xml
@@ -12,4 +12,13 @@ Without this configuration present, some functionality in the IDE may be limited
+
+
+ JDK_1.7
+
diff --git a/Genotype-Harmonizer/pom.xml b/Genotype-Harmonizer/pom.xml
index d5142e475..915a595c5 100644
--- a/Genotype-Harmonizer/pom.xml
+++ b/Genotype-Harmonizer/pom.xml
@@ -7,14 +7,14 @@
4.0.0Genotype-Harmonizer
- 1.4.12-SNAPSHOT
+ 1.4.16-SNAPSHOTGenotype Harmonizerjarnl.systemsgeneticsGenotype-IO
- 1.0.1
+ 1.0.2-SNAPSHOTcommons-cli
@@ -108,6 +108,15 @@
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 2.3.2
+
+ 1.7
+ 1.7
+
+
diff --git a/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/Aligner.java b/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/Aligner.java
index 27153622f..1a68bf6cb 100644
--- a/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/Aligner.java
+++ b/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/Aligner.java
@@ -1,7 +1,6 @@
package nl.umcg.deelenp.genotypeharmonizer;
import static JSci.maths.ArrayMath.covariance;
-import static JSci.maths.ArrayMath.variance;
import com.google.common.collect.Lists;
import java.io.BufferedWriter;
import java.io.File;
@@ -69,21 +68,21 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA
//In this loop we filter the variants present in the reference and swap the AG, AC, TC, TG SNPs.
studyVariants:
for (ModifiableGeneticVariant studyVariant : aligendStudyData.getModifiableGeneticVariants()) {
-
+
++iterationCounter;
if (iterationCounter % 10000 == 0) {
//LOGGER.info("Iteration 1 - " + GenotypeHarmonizer.DEFAULT_NUMBER_FORMATTER.format(iterationCounter) + " variants processed");
System.out.println("Iteration 1 - " + GenotypeHarmonizer.DEFAULT_NUMBER_FORMATTER.format(iterationCounter) + " variants processed");
}
-
+
if (!studyVariant.isMapped()) {
snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "No mapping");
studyVariant.exclude();
continue studyVariants;
}
-
- if(studyVariant.getStartPos() == 0){
+
+ if (studyVariant.getStartPos() == 0) {
snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "No mapping");
studyVariant.exclude();
continue studyVariants;
@@ -176,19 +175,19 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA
//If we get here we have found a variant is our reference data on the same position with comparable alleles.
- //We have to exclude maf of zero otherwise we cannot do LD calculation
- if (!(studyVariant.getMinorAlleleFrequency() > 0)) {
- snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "MAF of 0 in study data");
- studyVariant.exclude();
- continue studyVariants;
- }
-
- //We have to exclude maf of zero otherwise we can not do LD calculation
- if (!(refVariant.getMinorAlleleFrequency() > 0)) {
- snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "MAF of 0 in reference data");
- studyVariant.exclude();
- continue studyVariants;
- }
+// //We have to exclude maf of zero otherwise we cannot do LD calculation
+// if (!(studyVariant.getMinorAlleleFrequency() > 0)) {
+// snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "MAF of 0 in study data");
+// studyVariant.exclude();
+// continue studyVariants;
+// }
+//
+// //We have to exclude maf of zero otherwise we can not do LD calculation
+// if (!(refVariant.getMinorAlleleFrequency() > 0)) {
+// snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "MAF of 0 in reference data");
+// studyVariant.exclude();
+// continue studyVariants;
+// }
@@ -238,8 +237,8 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA
if (updateId) {
snpUpdateWriter.close();
}
-
- if(iterationCounter == 0){
+
+ if (iterationCounter == 0) {
throw new GenotypeAlignmentException("No variants where found in the input genotype data. Please check your variant filter options");
}
@@ -283,7 +282,7 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA
if (!studyVariant.isAtOrGcSnp()) {
//Correlate the haps with both these snps between study and ref
- correlationResults hapCor = correlateHaplotypes(minLdToIncludeAlign,
+ CorrelationResults hapCor = correlateHaplotypes(minLdToIncludeAlign,
flankSnpsToConsider, studyVariantList, refVariantList,
variantIndex, studyVariant, refVariant);
@@ -345,7 +344,7 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA
++GcAtSnpsEncountered;
//Correlate the haps with both these snps between study and ref
- correlationResults hapCor = correlateHaplotypes(minLdToIncludeAlign,
+ CorrelationResults hapCor = correlateHaplotypes(minLdToIncludeAlign,
flankSnpsToConsider, studyVariantList, refVariantList,
variantIndex, studyVariant, refVariant);
@@ -396,7 +395,7 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA
//Ld pattern should be okay now. but we are going to do the extra check
//Correlate the haps with both these snps between study and ref
- correlationResults hapCorSwapped = correlateHaplotypes(minLdToIncludeAlign,
+ CorrelationResults hapCorSwapped = correlateHaplotypes(minLdToIncludeAlign,
flankSnpsToConsider, studyVariantList, refVariantList,
variantIndex, studyVariant, refVariant);
@@ -445,7 +444,7 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA
}
- private correlationResults correlateHaplotypes(double minLdToIncludeAlignBase,
+ private CorrelationResults correlateHaplotypes(double minLdToIncludeAlignBase,
int flankSnpsToConsider,
ArrayList studyVariantList,
ArrayList refVariantList, int variantIndex,
@@ -488,36 +487,12 @@ private correlationResults correlateHaplotypes(double minLdToIncludeAlignBase,
ldStudy = LdCalculator.calculateLd(snpStudyVariant, otherSnpStudyVariant);
ldRef = LdCalculator.calculateLd(refVariant, otherRefVariant);
} catch (LdCalculatorException e) {
- LOGGER.warn("Error in LD calculation, skipping this comparison when comparing haplotype structure. Following error occurred: " + e.getMessage());
+ LOGGER.debug("Error in LD calculation, skipping this comparison when comparing haplotype structure. Following error occurred: " + e.getMessage());
continue;
}
-// if(snpStudyVariant.getPrimaryVariantId().equals("rs1001945")){
-// LOGGER.debug(" * Other variant: " + otherSnpStudyVariant.getPrimaryVariantId() +
-// "\nstudy alleles: " + otherSnpStudyVariant.getVariantAlleles() + " ref alleles: " + otherRefVariant.getVariantAlleles() + "\n"
-// + "maf study: " + otherSnpStudyVariant.getMinorAlleleFrequency() + "(" + otherSnpStudyVariant.getMinorAllele() + ") maf ref: " + otherRefVariant.getMinorAlleleFrequency() + "(" + otherRefVariant.getMinorAllele() + ")\n" +
-// "LD study, R2: " + ldStudy.getR2() + " D': " + ldStudy.getDPrime() + "\n" +
-// "LD ref, R2: " + ldRef.getR2() + " D': " + ldRef.getDPrime() + "\n");
-//
-//
-// StringBuilder s = new StringBuilder();
-// for(byte b : snpStudyVariant.getSampleCalledDosages()){
-// s.append(b);
-// }
-// LOGGER.debug(s);
-//
-// s = new StringBuilder();
-// for(byte b : otherSnpStudyVariant.getSampleCalledDosages()){
-// s.append(b);
-// }
-// LOGGER.debug(s);
-//
-//
-//
-//
-// }
//only use SNPs with min R2 in both study as ref
- if (ldStudy.getR2() >= minLdToIncludeAlignBase && ldRef.getR2() >= minLdToIncludeAlignBase) {
+ if ( !Double.isNaN(ldStudy.getR2()) && !Double.isNaN(ldRef.getR2()) && ldStudy.getR2() >= minLdToIncludeAlignBase && ldRef.getR2() >= minLdToIncludeAlignBase) {
//Put in tree map to sort haplotypes. This can differ in the case of different reference allele
TreeMap studyHapFreq = new TreeMap(ldStudy.getHaplotypesFreq());
@@ -542,14 +517,14 @@ private correlationResults correlateHaplotypes(double minLdToIncludeAlignBase,
++posCor;
}
- }
+ }
}
}
- return new correlationResults(posCor, negCor);
+ return new CorrelationResults(posCor, negCor);
}
private double[] createDoubleArrayFromCollection(
@@ -567,12 +542,12 @@ private double[] createDoubleArrayFromCollection(
return array;
}
- private static class correlationResults {
+ private static class CorrelationResults {
private final int posCor;
private final int negCor;
- public correlationResults(int posCor, int negCor) {
+ public CorrelationResults(int posCor, int negCor) {
super();
this.posCor = posCor;
this.negCor = negCor;
diff --git a/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerParamaters.java b/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerParamaters.java
index 29d70adf1..8ce8b3cdc 100644
--- a/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerParamaters.java
+++ b/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerParamaters.java
@@ -282,7 +282,7 @@ public GenotypeHarmonizerParamaters(String... args) throws ParseException {
try {
if (commandLine.hasOption('I')) {
- inputType = RandomAccessGenotypeDataReaderFormats.valueOf(commandLine.getOptionValue('I').toUpperCase());
+ inputType = RandomAccessGenotypeDataReaderFormats.valueOfSmart(commandLine.getOptionValue('I').toUpperCase());
} else {
if (inputBasePaths[0].endsWith(".vcf")) {
throw new ParseException("Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file.");
diff --git a/Genotype-Harmonizer/src/test/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerTest.java b/Genotype-Harmonizer/src/test/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerTest.java
index 4fc4fa4c7..a4d338bac 100644
--- a/Genotype-Harmonizer/src/test/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerTest.java
+++ b/Genotype-Harmonizer/src/test/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerTest.java
@@ -115,7 +115,7 @@ public void testMain() throws Exception {
}
- assertEquals(variantCounter, 3745);
+ assertEquals(variantCounter, 3747);
//Check if ID is updated based on 1000G
assertEquals(aligenedHapmap3Data.getSnpVariantByPos("20", 809930).getPrimaryVariantId(), "rs78472400");
@@ -183,7 +183,7 @@ public void testMain2() throws Exception {
}
- assertEquals(variantCounter, 4086);
+ assertEquals(variantCounter, 4088);
//Check if number of samples is correct
assertEquals(aligenedHapmap3Data.getSamples().size(), 165);
@@ -252,7 +252,7 @@ public void testMain3() throws Exception {
}
- assertEquals(variantCounter, 4086);
+ assertEquals(variantCounter, 4088);
//Check if ID is updated based on 1000G
assertEquals(aligenedHapmap3Data.getSnpVariantByPos("20", 809930).getPrimaryVariantId(), "rs78472400");
@@ -358,7 +358,7 @@ public void testMain5() throws Exception {
}
- assertEquals(variantCounter, 3778);
+ assertEquals(variantCounter, 3780);
//Check if ID is updated based on 1000G
assertEquals(aligenedHapmap3Data.getSnpVariantByPos("20", 809930).getPrimaryVariantId(), "rs78472400");
@@ -423,7 +423,7 @@ public void testMain6() throws Exception {
}
- assertEquals(variantCounter, 3745);
+ assertEquals(variantCounter, 3747);
//Check if ID is updated based on 1000G
assertEquals(aligenedHapmap3Data.getSnpVariantByPos("20", 809930).getPrimaryVariantId(), "rs78472400");
@@ -486,7 +486,7 @@ public void testMain7() throws Exception {
}
- assertEquals(variantCounter, 4078);
+ assertEquals(variantCounter, 4087);
//Check if number of samples is correct
assertEquals(aligenedHapmap3Data.getSamples().size(), 155);
diff --git a/Genotype-IO/pom.xml b/Genotype-IO/pom.xml
index da4b14682..d071df3b4 100644
--- a/Genotype-IO/pom.xml
+++ b/Genotype-IO/pom.xml
@@ -82,6 +82,8 @@
2.3.2UTF-8
+ 1.7
+ 1.7
diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/AbstractRandomAccessGenotypeData.java b/Genotype-IO/src/main/java/org/molgenis/genotype/AbstractRandomAccessGenotypeData.java
index 8770c6bc7..6335d7538 100644
--- a/Genotype-IO/src/main/java/org/molgenis/genotype/AbstractRandomAccessGenotypeData.java
+++ b/Genotype-IO/src/main/java/org/molgenis/genotype/AbstractRandomAccessGenotypeData.java
@@ -6,15 +6,14 @@
import org.molgenis.genotype.variant.GeneticVariant;
import org.molgenis.genotype.variantFilter.VariantFilter;
-public abstract class AbstractRandomAccessGenotypeData extends AbstractGenotypeData implements RandomAccessGenotypeData
-{
+public abstract class AbstractRandomAccessGenotypeData extends AbstractGenotypeData implements RandomAccessGenotypeData {
+
+ private HashMap fullVariantMap = null;
+
@Override
- public Sequence getSequenceByName(String name)
- {
- for (Sequence sequence : getSequences())
- {
- if (sequence.getName().equals(name))
- {
+ public Sequence getSequenceByName(String name) {
+ for (Sequence sequence : getSequences()) {
+ if (sequence.getName().equals(name)) {
return sequence;
}
}
@@ -23,14 +22,11 @@ public Sequence getSequenceByName(String name)
}
@Override
- public GeneticVariant getSnpVariantByPos(String seqName, int startPos)
- {
+ public GeneticVariant getSnpVariantByPos(String seqName, int startPos) {
Iterable variants = getVariantsByPos(seqName, startPos);
- for (GeneticVariant variant : variants)
- {
- if (variant.isSnp())
- {
+ for (GeneticVariant variant : variants) {
+ if (variant.isSnp()) {
// only one SNP possible per position. Returning this SNP only
return variant;
}
@@ -42,59 +38,63 @@ public GeneticVariant getSnpVariantByPos(String seqName, int startPos)
@Override
public HashMap getVariantIdMap() {
- return getVariantIdMap(null);
+
+ if (fullVariantMap == null) {
+ fullVariantMap = getVariantIdMap(null);
+ }
+ return fullVariantMap;
+
}
+ @Override
+ public void clearVariantIdMap(){
+ fullVariantMap = null;
+ }
+
@Override
public HashMap getVariantIdMap(VariantFilter filter) {
-
+
HashMap variantIdMap = new HashMap();
-
- for(GeneticVariant variant : this){
- if( variant.getVariantId().getPrimairyId() != null && !variant.getPrimaryVariantId().equals("") && (filter == null || filter.doesVariantPassFilter(variant))){
+
+ for (GeneticVariant variant : this) {
+ if (variant.getVariantId().getPrimairyId() != null && !variant.getPrimaryVariantId().equals("") && (filter == null || filter.doesVariantPassFilter(variant))) {
variantIdMap.put(variant.getPrimaryVariantId(), variant);
}
}
-
+
return variantIdMap;
-
+
}
@Override
- public Iterator iterator()
- {
+ public Iterator iterator() {
return new GeneticVariantsIterator(this);
}
- private static class GeneticVariantsIterator implements Iterator
- {
+ private static class GeneticVariantsIterator implements Iterator {
+
private Iterator seqNames;
private Iterator seqGeneticVariants;
private RandomAccessGenotypeData randomAccessGenotypeData;
- public GeneticVariantsIterator(RandomAccessGenotypeData randomAccessGenotypeData)
- {
+ public GeneticVariantsIterator(RandomAccessGenotypeData randomAccessGenotypeData) {
seqNames = randomAccessGenotypeData.getSeqNames().iterator();
seqGeneticVariants = randomAccessGenotypeData.getSequenceGeneticVariants(seqNames.next()).iterator();
this.randomAccessGenotypeData = randomAccessGenotypeData;
}
@Override
- public boolean hasNext()
- {
+ public boolean hasNext() {
return seqGeneticVariants.hasNext() || seqNames.hasNext();
}
@Override
- public GeneticVariant next()
- {
- if (seqGeneticVariants.hasNext())
- {
+ public GeneticVariant next() {
+ if (seqGeneticVariants.hasNext()) {
return seqGeneticVariants.next();
}
- if (seqNames.hasNext())
- {
+ if (seqNames.hasNext()) {
seqGeneticVariants = randomAccessGenotypeData.getSequenceGeneticVariants(seqNames.next()).iterator();
return seqGeneticVariants.next();
}
@@ -103,12 +103,8 @@ public GeneticVariant next()
}
@Override
- public void remove()
- {
+ public void remove() {
throw new UnsupportedOperationException();
}
-
-
-
}
}
diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeData.java b/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeData.java
index 68309c1fe..2cdccd06f 100644
--- a/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeData.java
+++ b/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeData.java
@@ -81,6 +81,11 @@ public interface RandomAccessGenotypeData extends GenotypeData {
*/
HashMap getVariantIdMap(VariantFilter filter);
+ /**
+ * The variant ID map built without a filter is cached; use this method to clear that cache.
+ */
+ void clearVariantIdMap();
+
/**
* Get a HashMap with the variants that have a primairy ID.
*
diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeDataReaderFormats.java b/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeDataReaderFormats.java
index 37c1ef10d..bafbfd8bb 100644
--- a/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeDataReaderFormats.java
+++ b/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeDataReaderFormats.java
@@ -310,6 +310,10 @@ public static RandomAccessGenotypeDataReaderFormats valueOfSmart(String value){
return PLINK_BED;
} else if (value.equals("B_PLINK")){
return PLINK_BED;
+ } else if (value.equals("PLINKB")){
+ return PLINK_BED;
+ } else if (value.equals("PLINK_B")){
+ return PLINK_BED;
}
return RandomAccessGenotypeDataReaderFormats.valueOf(value);
diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/plink/BedBimFamGenotypeWriter.java b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/BedBimFamGenotypeWriter.java
index a79b9836d..963a9dd2f 100644
--- a/Genotype-IO/src/main/java/org/molgenis/genotype/plink/BedBimFamGenotypeWriter.java
+++ b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/BedBimFamGenotypeWriter.java
@@ -143,7 +143,7 @@ private void writeBimBedFile(File bimFile, File bedFile) throws IOException {
continue;
}
- bimFileWriter.append(variant.getSequenceName());
+ bimFileWriter.append(FormatPlinkChr.formatChr(variant.getSequenceName()));
bimFileWriter.append(SEPARATOR);
bimFileWriter.append(variant.getPrimaryVariantId() == null ? variant.getSequenceName() + ":" + variant.getStartPos() : variant.getPrimaryVariantId());
bimFileWriter.append(SEPARATOR);
diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/plink/FormatPlinkChr.java b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/FormatPlinkChr.java
new file mode 100644
index 000000000..92bcbbe70
--- /dev/null
+++ b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/FormatPlinkChr.java
@@ -0,0 +1,31 @@
+package org.molgenis.genotype.plink;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ *
+ * @author Patrick Deelen
+ */
+public class FormatPlinkChr {
+
+ private static final Pattern CHR_PATTERN = Pattern.compile("^chr(.*)$", Pattern.CASE_INSENSITIVE);
+
+ public static String formatChr(String chrName){
+
+ Matcher chrMatcher = CHR_PATTERN.matcher(chrName);
+ if (chrMatcher.find()) {
+ chrName = chrMatcher.group(1);
+ }
+
+ switch(chrName){
+ case "X": return "23";
+ case "Y": return "24";
+ case "XY": return "25";
+ case "MT": return "26";
+ default: return chrName;
+ }
+
+ }
+
+}
diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/plink/PedMapGenotypeWriter.java b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/PedMapGenotypeWriter.java
index d6c091cb7..d5ec45c7b 100644
--- a/Genotype-IO/src/main/java/org/molgenis/genotype/plink/PedMapGenotypeWriter.java
+++ b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/PedMapGenotypeWriter.java
@@ -76,7 +76,7 @@ private void writeMapFile(File mapFile) throws IOException {
continue;
}
- mapFileWriter.append(variant.getSequenceName());
+ mapFileWriter.append(FormatPlinkChr.formatChr(variant.getSequenceName()));
mapFileWriter.append(SEPARATOR);
mapFileWriter.append(variant.getPrimaryVariantId() == null ? variant.getSequenceName() + ":" + variant.getStartPos() : variant.getPrimaryVariantId());
mapFileWriter.append(SEPARATOR);
diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/table/TableGenotypeWriter.java b/Genotype-IO/src/main/java/org/molgenis/genotype/table/TableGenotypeWriter.java
index bd99b0e99..ce13c2c99 100644
--- a/Genotype-IO/src/main/java/org/molgenis/genotype/table/TableGenotypeWriter.java
+++ b/Genotype-IO/src/main/java/org/molgenis/genotype/table/TableGenotypeWriter.java
@@ -42,8 +42,13 @@ public void write(String path) {
for (GeneticVariant variant : genotypeData) {
- dosageWriter.append(variant.getPrimaryVariantId());
- genotypeWriter.append(variant.getPrimaryVariantId());
+ String variantId = variant.getPrimaryVariantId();
+ if(variantId == null){
+ variantId = variant.getSequenceName() + ":" + variant.getStartPos();
+ }
+
+ dosageWriter.append(variantId);
+ genotypeWriter.append(variantId);
for (float dosage : variant.getSampleDosages()) {
dosageWriter.append('\t');
diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeData.java b/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeData.java
index 4ce5477c7..00178a2e9 100644
--- a/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeData.java
+++ b/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeData.java
@@ -120,7 +120,7 @@ public TriTyperGenotypeData(File location, int cacheSize, VariantFilter variantF
}
public TriTyperGenotypeData(File location, int cacheSize, VariantFilter variantFilter, SampleFilter sampleFilter) throws IOException {
- this(new File(location, "GenotypeMatrix.dat"), new File(location, "ImputedDosageMatrix.dat").exists() ? new File(location, "ImputedDosageMatrix.dat") : null, new File(location, "SNPs.txt.gz").exists() ? new File(location, "SNPs.txt.gz") : new File(location, "SNPs.txt"), new File(location, "SNPMappings.txt.gz").exists() ? new File(location, "SNPMappings.txt.gz") : new File(location, "SNPMappings.txt"), new File(location, "Individuals.txt.gz").exists() ? new File(location, "Individuals.txt.gz") : new File(location, "Individuals.txt"), new File(location, "PhenotypeInformation.txt.gz").exists() ? new File(location, "PhenotypeInformation.txt.gz") : new File(location, "PhenotypeInformation.txt"), cacheSize, variantFilter, sampleFilter, new File(location, "allelRecodingInformation.txt").exists() ? new File(location, "allelRecodingInformation.txt") : null);
+ this(new File(location, "GenotypeMatrix.dat"), new File(location, "ImputedDosageMatrix.dat").exists() ? new File(location, "ImputedDosageMatrix.dat") : null, new File(location, "SNPs.txt.gz").exists() ? new File(location, "SNPs.txt.gz") : new File(location, "SNPs.txt"), new File(location, "SNPMappings.txt.gz").exists() ? new File(location, "SNPMappings.txt.gz") : new File(location, "SNPMappings.txt"), new File(location, "Individuals.txt.gz").exists() ? new File(location, "Individuals.txt.gz") : new File(location, "Individuals.txt"), new File(location, "PhenotypeInformation.txt.gz").exists() ? new File(location, "PhenotypeInformation.txt.gz") : new File(location, "PhenotypeInformation.txt"), cacheSize, variantFilter, sampleFilter, new File(location, "AlleleRecodingInformation.txt").exists() ? new File(location, "AlleleRecodingInformation.txt") : null);
}
public TriTyperGenotypeData(File genotypeDataFile, File imputedDosageDataFile, File snpFile, File snpMapFile, File individualFile, File phenotypeAnnotationFile, int cacheSize, VariantFilter variantFilter, SampleFilter sampleFilter, File allelRecoding) throws IOException {
@@ -345,6 +345,9 @@ private void loadSNPAnnotation(GeneticVariantRange.GeneticVariantRangeCreate snp
String line;
while ((line = snpFileReader.readLine()) != null) {
+ if(allSNPHash.contains(line)){
+ throw new GenotypeDataException("SNP found twice: " + line + ". All SNP ID's must be unique");
+ }
if (variantFilter == null || variantFilter.doesIdPassFilter(line)) {
allSNPHash.put(line, unfilteredSnpCount);
}
@@ -459,11 +462,17 @@ public List getSampleVariants(GeneticVariant variant) {
try {
genotypeHandle.seek(indexLong);
if (genotypeHandle.read(buffer) != buffer.length) {
+
+ LOG.fatal("ERROR loading trityper SNP: " + variant.getPrimaryVariantId() + " at: " + variant.getSequenceName() + ":" + variant.getStartPos() + " variant index: " + index);
+
throw new GenotypeDataException("Could not read bytes from: " + indexLong + " in genotype file " + genotypeDataFile.getAbsolutePath() + " (size: " + genotypeDataFile.length() + ")");
}
} catch (IOException e) {
- throw new GenotypeDataException("Could not read bytes from: " + indexLong + " in genotype file " + genotypeDataFile.getAbsolutePath() + " (size: " + genotypeDataFile.length() + ")");
+
+ LOG.fatal("ERROR loading trityper SNP: " + variant.getPrimaryVariantId() + " at: " + variant.getSequenceName() + ":" + variant.getStartPos() + " variant index: " + index);
+
+ throw new GenotypeDataException("Could not read bytes from: " + indexLong + " in genotype file " + genotypeDataFile.getAbsolutePath() + " (size: " + genotypeDataFile.length() + ")", e);
}
List alleles = new ArrayList(includedSamples.size());
diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeWriter.java b/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeWriter.java
index a8fa0905c..f52fabcc6 100644
--- a/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeWriter.java
+++ b/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeWriter.java
@@ -15,6 +15,7 @@
import org.molgenis.genotype.Sample;
import org.molgenis.genotype.variant.GeneticVariant;
import org.molgenis.genotype.variant.NotASnpException;
+import org.molgenis.genotype.variant.id.GeneticVariantId;
/**
*
@@ -48,7 +49,7 @@ public void write(File folder) throws IOException {
File snpMapFile = new File(folder, "SNPMappings.txt");
File individualFile = new File(folder, "Individuals.txt");
File phenotypeAnnotationFile = new File(folder, "PhenotypeInformation.txt");
- File allelRecodingFile = new File(folder, "allelRecodingInformation.txt");
+ File allelRecodingFile = new File(folder, "AlleleRecodingInformation.txt");
writeSnps(snpFile, snpMapFile);
writeSamples(individualFile, phenotypeAnnotationFile);
@@ -70,14 +71,16 @@ private void writeSnps(File snpFile, File snpMapFile) throws IOException {
// continue;
// }
- snpFileWriter.append(variant.getPrimaryVariantId());
+ final String snpName = createTriTyperVariantId(variant);
+
+ snpFileWriter.append(snpName);
snpFileWriter.append('\n');
snpMapFileWriter.append(variant.getSequenceName());
snpMapFileWriter.append('\t');
snpMapFileWriter.append(String.valueOf(variant.getStartPos()));
snpMapFileWriter.append('\t');
- snpMapFileWriter.append(variant.getPrimaryVariantId());
+ snpMapFileWriter.append(snpName);
snpMapFileWriter.append('\n');
}
@@ -147,7 +150,7 @@ private void writeGenotypes(File genotypeDataFile, File imputedDosageDataFile, F
a = sampleAlleles.get(0).isSnpAllele() && sampleAlleles.get(0) != Allele.ZERO ? (byte) sampleAlleles.get(0).getAlleleAsSnp() : 0;
b = sampleAlleles.get(1).isSnpAllele() && sampleAlleles.get(1) != Allele.ZERO ? (byte) sampleAlleles.get(1).getAlleleAsSnp() : 0;
} else {
- snpRecodingInfo.add(variant.getPrimaryVariantId()+"\t"+variant.getSequenceName()+"\t"+variant.getStartPos()+"\t"+variant.getVariantAlleles().get(0)+"\t"+variant.getVariantAlleles().get(1));
+ snpRecodingInfo.add(createTriTyperVariantId(variant)+"\t"+variant.getSequenceName()+"\t"+variant.getStartPos()+"\t"+variant.getVariantAlleles().get(0)+"\t"+variant.getVariantAlleles().get(1));
if(sampleAlleles.get(0).equals(variant.getVariantAlleles().get(0))){
a = (byte) 'A';
@@ -196,7 +199,7 @@ private void writeGenotypes(File genotypeDataFile, File imputedDosageDataFile, F
if(!snpRecodingInfo.isEmpty()){
BufferedWriter allelRecodingFileWriter = new BufferedWriter(new FileWriter(allelRecodingFile));
- allelRecodingFileWriter.write("Variant_ID\tchr\tpos\tAllel1\tAllel2\n");
+ allelRecodingFileWriter.write("Variant_ID\tChr\tPos\tAllele1\tAllele2\n");
for(String s : snpRecodingInfo){
allelRecodingFileWriter.write(s+"\n");
}
@@ -204,4 +207,21 @@ private void writeGenotypes(File genotypeDataFile, File imputedDosageDataFile, F
allelRecodingFileWriter.close();
}
}
+
+ private String createTriTyperVariantId(GeneticVariant variant) {
+ final GeneticVariantId snpId = variant.getVariantId();
+ String snpName;
+ if(snpId.containsId()){
+ snpName = snpId.getPrimairyId();
+ } else {
+ snpName = variant.getSequenceName() + ':' + String.valueOf(variant.getStartPos());
+ if(!variant.isSnp()){
+ for(Allele allele : variant.getVariantAlleles()){
+ snpName = snpName + "_" + allele.getAlleleAsString();
+ }
+
+ }
+ }
+ return snpName;
+ }
}
diff --git a/cellTypeSpecificAlleleSpecificExpression/README.md b/cellTypeSpecificAlleleSpecificExpression/README.md
index 297b20d61..51fa77d78 100644
--- a/cellTypeSpecificAlleleSpecificExpression/README.md
+++ b/cellTypeSpecificAlleleSpecificExpression/README.md
@@ -1,3 +1 @@
-# Cell type specific Allele specific Expression
-
-please see the [wiki](https://github.com/adriaan-vd-graaf/systemsgenetics/wiki/ASE) for full documentation on usage and internal mechanisms.
\ No newline at end of file
+Please see the [wiki](https://github.com/adriaan-vd-graaf/systemsgenetics/wiki/ASE) for full documentation on usage and internal mechanisms.
diff --git a/eQTLInteractionAnalyser/pom.xml b/eQTLInteractionAnalyser/pom.xml
new file mode 100644
index 000000000..b2f231ebb
--- /dev/null
+++ b/eQTLInteractionAnalyser/pom.xml
@@ -0,0 +1,92 @@
+
+
+ 4.0.0
+
+ nl.systemsgenetics
+ systemsgenetics
+ 1.0.2-SNAPSHOT
+
+ nl.systemsgenetics
+ eQTLInteractionAnalyser
+ 1.1-SNAPSHOT
+ eQTLInteractionAnalyser
+ http://maven.apache.org
+
+ UTF-8
+
+
+
+ net.sf.jsci
+ jsci
+ 1.2
+
+
+ org.apache.commons
+ commons-math3
+ 3.2
+
+
+ net.sourceforge.parallelcolt
+ parallelcolt
+ 0.10.0
+
+
+ gov.nist.math.jama
+ gov.nist.math.jama
+ 1.1.1
+
+
+ junit
+ junit
+ 3.8.1
+ test
+
+
+ nl.systemsgenetics
+ genetica-libraries
+ 1.0.7-SNAPSHOT
+
+
+ com.opencsv
+ opencsv
+ 3.4
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 2.3.2
+
+ UTF-8
+
+
+
+ maven-assembly-plugin
+
+
+ jar-with-dependencies
+
+
+
+ nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser.EQTLInteractionAnalyser
+ true
+ true
+
+
+
+
+
+ package
+
+ single
+
+
+
+
+
+
+
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/CompareToGeuvadis.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/CompareToGeuvadis.java
new file mode 100644
index 000000000..799d77c2f
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/CompareToGeuvadis.java
@@ -0,0 +1,79 @@
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+import java.util.HashSet;
+import java.util.Map;
+
+/**
+ *
+ * @author Patrick Deelen
+ */
+public class CompareToGeuvadis {
+
+ /**
+ * @param args the command line arguments
+ */
+ public static void main(String[] args) {
+
+ ExpressionDataset bios = new ExpressionDataset("/Volumes/Promise_RAID/lude/InteractionZScoresMatrix-4Covariates.txt.binary");
+ ExpressionDataset geuvadis = new ExpressionDataset("/Volumes/Promise_RAID/projects/BBMRI/interactionsGeuvadisRegressOut/InteractionZScoresMatrix-9Covariates.txt.binary");
+
+ HashSet covariatesReplicated = new HashSet();
+ HashSet genesReplicated = new HashSet();
+ int interactionsReplicated = 0;
+ int sameDirection = 0;
+ int oppositeDirection = 0;
+
+ for (Map.Entry covariateEntry : bios.hashProbes.entrySet()) {
+ for (Map.Entry eQtlGeneEntry : bios.hashSamples.entrySet()) {
+
+ String covariate = covariateEntry.getKey();
+ String eQtlGene = eQtlGeneEntry.getKey();
+
+// if(!covariate.equals("ENSG00000084072")){
+// continue;
+// }
+
+ double biosInteractionZ = bios.rawData[covariateEntry.getValue()][eQtlGeneEntry.getValue()];
+
+ if (biosInteractionZ >= 6 || biosInteractionZ <= -6) {
+
+ Integer geuvadisCovI = geuvadis.hashProbes.get(covariate);
+ Integer geuvadisGenI = geuvadis.hashSamples.get(eQtlGene);
+
+ if (geuvadisCovI != null && geuvadisGenI != null) {
+
+ double geuvadisInteractionZ = geuvadis.rawData[geuvadisCovI][geuvadisGenI];
+
+ if (geuvadisInteractionZ >= 5 || geuvadisInteractionZ <= -5) {
+
+ covariatesReplicated.add(covariate);
+ genesReplicated.add(eQtlGene);
+ interactionsReplicated++;
+
+ if(biosInteractionZ * geuvadisInteractionZ > 0){
+ sameDirection++;
+ } else {
+ oppositeDirection++;
+ }
+
+ System.out.println(covariate + "\t" + eQtlGene + "\t" + biosInteractionZ + "\t" + geuvadisInteractionZ);
+
+ }
+
+ }
+
+
+ }
+
+ }
+ }
+
+ System.out.println("Covariates replicated: " + covariatesReplicated.size());
+ System.out.println("Genes replicated: " + genesReplicated.size());
+ System.out.println("Interactions replicated: " + interactionsReplicated);
+ System.out.println("Interactions replicated same: " + sameDirection);
+ System.out.println("Interactions replicated opposite: " + oppositeDirection);
+
+ }
+
+}
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObject.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObject.java
new file mode 100644
index 000000000..6438d9215
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObject.java
@@ -0,0 +1,21 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+/**
+ *
+ * @author ludefranke
+ */
+public class DoubleArrayIntegerObject {
+
+ public double[] doubleArray;
+ public int intValue;
+ public DoubleArrayIntegerObject(double[] doubleArray, int intValue) {
+ this.doubleArray = doubleArray;
+ this.intValue = intValue;
+ }
+
+}
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObjectSorter.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObjectSorter.java
new file mode 100644
index 000000000..3e639d144
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObjectSorter.java
@@ -0,0 +1,29 @@
+/*
+ * GeneLocationObjectSorter.java
+ *
+ * Created on 23 December 2003, 17:14
+ */
+
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+/**
+ *
+ * @author Like
+ */
+public class DoubleArrayIntegerObjectSorter extends VectorSorter {
+
+ /** Creates a new instance of DoubleArrayIntegerObjectSorter */
+ public DoubleArrayIntegerObjectSorter() {
+ super();
+ }
+
+ /** Override object comparer
+ * @param a the first DoubleArrayIntegerObject to be compared
+ * @param b the second DoubleArrayIntegerObject to be compared
+ * @return true if the first DoubleArrayIntegerObject.intValue is lower than the second one
+ */
+ protected boolean lt (Object a, Object b) {
+ return (((DoubleArrayIntegerObject)a).intValue < ((DoubleArrayIntegerObject)b).intValue);
+ }
+
+}
\ No newline at end of file
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/EQTLInteractionAnalyser.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/EQTLInteractionAnalyser.java
new file mode 100644
index 000000000..aa7e62ad3
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/EQTLInteractionAnalyser.java
@@ -0,0 +1,352 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+import java.io.*;
+
+import org.apache.commons.cli.*;
+import umcg.genetica.io.text.TextFile;
+
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.HashMap;
+
+/**
+ *
+ * @author lude
+ */
+public class EQTLInteractionAnalyser {
+
+ private static final DateFormat DATE_TIME_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ private static final Date currentDataTime = new Date();
+ private static final Options OPTIONS;
+
+ static {
+
+ OPTIONS = new Options();
+
+ OptionBuilder.withArgName("path");
+ OptionBuilder.hasArgs();
+ OptionBuilder.withDescription("Path to the folder containing expression and genotype data");
+ OptionBuilder.withLongOpt("input");
+ OptionBuilder.isRequired();
+ OPTIONS.addOption(OptionBuilder.create("i"));
+
+ OptionBuilder.withArgName("path");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("Path to the output folder");
+ OptionBuilder.withLongOpt("output");
+ OptionBuilder.isRequired();
+ OPTIONS.addOption(OptionBuilder.create("o"));
+
+ OptionBuilder.withArgName("path");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("Path to the eQTL file to test for interactions");
+ OptionBuilder.withLongOpt("eqtls");
+ OPTIONS.addOption(OptionBuilder.create("e"));
+
+ OptionBuilder.withArgName("path");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("Path to the eQTL file to correct covariates");
+ OptionBuilder.withLongOpt("eqtlsCovariates");
+ OPTIONS.addOption(OptionBuilder.create("ec"));
+
+ OptionBuilder.withArgName("path");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("Path to the gene annotation file in the format of eQTL mapping pipeline");
+ OptionBuilder.withLongOpt("annot");
+ OPTIONS.addOption(OptionBuilder.create("a"));
+
+ OptionBuilder.withArgName("int");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("Maximum number of covariates to regress out");
+ OptionBuilder.withLongOpt("maxcov");
+ OPTIONS.addOption(OptionBuilder.create("n"));
+
+ OptionBuilder.withDescription("Interpret the z-score matrices");
+ OptionBuilder.withLongOpt("interpret");
+ OPTIONS.addOption(OptionBuilder.create("it"));
+
+ OptionBuilder.withDescription("Run permutation");
+ OptionBuilder.withLongOpt("permute");
+ OPTIONS.addOption(OptionBuilder.create("perm"));
+
+ OptionBuilder.withDescription("Find chi2sum differences for each covariate between 2 consecutive interaction runs");
+ OptionBuilder.withLongOpt("chi2sumDiff");
+ OPTIONS.addOption(OptionBuilder.create("dif"));
+
+ OptionBuilder.withArgName("int");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("Start round for chi2sumDiff option");
+ OptionBuilder.withLongOpt("start");
+ OPTIONS.addOption(OptionBuilder.create("s"));
+
+ OptionBuilder.withDescription("Preprocess the data");
+ OptionBuilder.withLongOpt("preprocess");
+ OPTIONS.addOption(OptionBuilder.create("p"));
+
+ OptionBuilder.withDescription("Convert matrix");
+ OptionBuilder.withLongOpt("convertMatrix");
+ OPTIONS.addOption(OptionBuilder.create("cm"));
+
+ OptionBuilder.withDescription("Skip all normalization step. n must be 1");
+ OptionBuilder.withLongOpt("noNormalization");
+ OPTIONS.addOption(OptionBuilder.create("nn"));
+
+ OptionBuilder.withDescription("Skip covariate normalization step. n must be 1");
+ OptionBuilder.withLongOpt("noCovNormalization");
+ OPTIONS.addOption(OptionBuilder.create("ncn"));
+
+ OptionBuilder.withArgName("strings");
+ OptionBuilder.hasArgs();
+ OptionBuilder.withDescription("covariates to correct for using an interaction term before running the interaction analysis");
+ OptionBuilder.withLongOpt("cov");
+ OPTIONS.addOption(OptionBuilder.create("c"));
+
+ OptionBuilder.withArgName("strings");
+ OptionBuilder.hasArgs();
+ OptionBuilder.withDescription("Covariates to correct for without interaction term before running the interaction analysis");
+ OptionBuilder.withLongOpt("cov2");
+ OPTIONS.addOption(OptionBuilder.create("c2"));
+
+ OptionBuilder.withArgName("strings");
+ OptionBuilder.hasArgs();
+ OptionBuilder.withDescription("Cohorts to use in the interaction analysis");
+ OptionBuilder.withLongOpt("cohorts");
+ OPTIONS.addOption(OptionBuilder.create("ch"));
+
+ OptionBuilder.withArgName("strings");
+ OptionBuilder.hasArgs();
+ OptionBuilder.withDescription("Covariates to test in interaction analysis. Optional, all are tested if not used");
+ OptionBuilder.withLongOpt("covTest");
+ OPTIONS.addOption(OptionBuilder.create("ct"));
+
+ OptionBuilder.withArgName("path");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("File containing the covariates to correct for using an interaction term before running the interaction analysis. No header, each covariate on a separate line");
+ OptionBuilder.withLongOpt("covFile");
+ OPTIONS.addOption(OptionBuilder.create("cf"));
+
+ OptionBuilder.withArgName("path");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("File containing the SNPs to swap");
+ OptionBuilder.withLongOpt("swap");
+ OPTIONS.addOption(OptionBuilder.create("sw"));
+
+ OptionBuilder.withArgName("path");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("Included samples");
+ OptionBuilder.withLongOpt("includedSamples");
+ OPTIONS.addOption(OptionBuilder.create("is"));
+
+ OptionBuilder.withArgName("path");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("Gene annotation file");
+ OptionBuilder.withLongOpt("geneAnnotation");
+ OPTIONS.addOption(OptionBuilder.create("ga"));
+
+ OptionBuilder.withArgName("int");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("Number of threads");
+ OptionBuilder.withLongOpt("threads");
+ OPTIONS.addOption(OptionBuilder.create("nt"));
+
+ OptionBuilder.withArgName("int");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("Z-score difference threshold for interpretation");
+ OptionBuilder.withLongOpt("threshold");
+ OPTIONS.addOption(OptionBuilder.create("thr"));
+
+ OptionBuilder.withArgName("path");
+ OptionBuilder.hasArg();
+ OptionBuilder.withDescription("SNPs to test");
+ OptionBuilder.withLongOpt("snpsToTest");
+ OPTIONS.addOption(OptionBuilder.create("snps"));
+ }
+
+ public static void main(String[] args) throws IOException, Exception {
+ System.out.println("Starting interaction analysis");
+ System.out.println("Current date and time: " + DATE_TIME_FORMAT.format(currentDataTime));
+ System.out.println();
+
+ String inputDir, outputDir, eqtlFile = null, annotationFile = null;
+ final File snpsToSwapFile;
+ int maxNumCovariatesToRegress = 20;
+ int numThreads;
+ final boolean interpret, chi2sumDiff, permute, preproces;
+ final int startRoundCompareChi2, threshold;
+
+ HashMap hashSamples;
+
+ final String[] covariates;
+ final String[] covariates2;
+ final String[] cohorts;
+ final String[] covariatesToTest;
+ final File ensgAnnotationFile;
+ final File snpsToTestFile;
+ final boolean skipNormalization;
+ final boolean skipCovariateNormalization;
+ final boolean convertMatrix;
+ final String eqtlFileCovariates;
+
+ try {
+ final CommandLine commandLine = new PosixParser().parse(OPTIONS, args, false);
+
+ inputDir = commandLine.getOptionValue("i");
+ outputDir = commandLine.getOptionValue("o");
+
+ if (commandLine.hasOption('e')) {
+ eqtlFile = commandLine.getOptionValue("e");
+ }
+
+
+ eqtlFileCovariates = commandLine.getOptionValue("ec", null);
+
+ if (commandLine.hasOption('n')) {
+ maxNumCovariatesToRegress = Integer.parseInt(commandLine.getOptionValue("n"));
+ }
+ if (commandLine.hasOption("thr")) {
+ threshold = Integer.parseInt(commandLine.getOptionValue("thr"));
+ }
+ else {
+ threshold = 3;
+ }
+
+
+ interpret = commandLine.hasOption("it");
+ chi2sumDiff = commandLine.hasOption("dif");
+ permute = commandLine.hasOption("perm");
+ preproces = commandLine.hasOption("p");
+ convertMatrix = commandLine.hasOption("cm");
+
+ if (commandLine.hasOption('s')) {
+ startRoundCompareChi2 = Integer.parseInt(commandLine.getOptionValue("s"));
+ } else if(chi2sumDiff){
+ throw new Exception("Set -s");
+ } else {
+ startRoundCompareChi2 = 0;
+ }
+
+ if (commandLine.hasOption('a')) {
+ annotationFile = commandLine.getOptionValue("a");
+ }
+
+ if (commandLine.hasOption("cf")) {
+ TextFile covFile = new TextFile(commandLine.getOptionValue("cf"), false);
+ covariates = covFile.readAsArray();
+ covFile.close();
+ }
+ else if (commandLine.hasOption("c")){
+ covariates = commandLine.getOptionValues("c");
+ } else {
+ covariates = new String[0];
+ }
+
+ if (commandLine.hasOption("c2")){
+ covariates2 = commandLine.getOptionValues("c2");
+ } else {
+ covariates2 = new String[0];
+ }
+
+ if (commandLine.hasOption("ch")){
+ cohorts = commandLine.getOptionValues("ch");
+ } else {
+ cohorts = null;
+ }
+
+ if (commandLine.hasOption("ct")){
+ covariatesToTest = commandLine.getOptionValues("ct");
+ } else {
+ covariatesToTest = null;
+ }
+
+ if (commandLine.hasOption("sw")){
+ snpsToSwapFile = new File(commandLine.getOptionValue("sw"));
+ } else {
+ snpsToSwapFile = null;
+ }
+
+ if (commandLine.hasOption("snps")){
+ snpsToTestFile = new File(commandLine.getOptionValue("snps"));
+ } else {
+ snpsToTestFile = null;
+ }
+
+ skipNormalization = commandLine.hasOption("nn");
+ if(skipNormalization && maxNumCovariatesToRegress != 1){
+ System.err.println("n must be one if normalization is turned off");
+ System.exit(-1);
+ }
+
+ skipCovariateNormalization = commandLine.hasOption("ncn");
+ if(skipCovariateNormalization && maxNumCovariatesToRegress != 1){
+ System.err.println("n must be one if covariate normalization is turned off");
+ System.exit(-1);
+ }
+
+ if (commandLine.hasOption("is")){
+ File samplesToIncludeFile = new File(commandLine.getOptionValue("is"));
+ System.out.println("Samples to include file: " + samplesToIncludeFile.getAbsolutePath());
+ hashSamples = new HashMap();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(samplesToIncludeFile), "UTF-8"));
+ String line;
+ while ((line = reader.readLine()) != null) {
+ hashSamples.put(line, null);
+ hashSamples.put(line + "_exp", null);
+ hashSamples.put(line + "_dosage", null);
+ }
+ } else {
+ hashSamples = null;
+ }
+
+
+ if (commandLine.hasOption("ga")){
+ ensgAnnotationFile = new File(commandLine.getOptionValue("ga"));
+ } else {
+ ensgAnnotationFile = null;
+ }
+ if (commandLine.hasOption("nt")) {
+ numThreads = Integer.parseInt(commandLine.getOptionValue("nt"));
+ } else {
+ numThreads = Runtime.getRuntime().availableProcessors();
+ }
+
+ } catch (ParseException ex) {
+ System.err.println("Invalid command line arguments: ");
+ System.err.println(ex.getMessage());
+ System.err.println();
+ new HelpFormatter().printHelp(" ", OPTIONS);
+ System.exit(1);
+ return;
+ }
+
+ if(preproces){
+ TestEQTLDatasetForInteractions interactor = new TestEQTLDatasetForInteractions(inputDir, outputDir);
+ interactor.preprocessData();
+ } else if (interpret){
+ TestEQTLDatasetForInteractions interactor = new TestEQTLDatasetForInteractions(inputDir, outputDir);
+ interactor.interpretInteractionZScoreMatrix(maxNumCovariatesToRegress, startRoundCompareChi2, threshold);
+ }
+ else if (chi2sumDiff){
+ TestEQTLDatasetForInteractions interactor = new TestEQTLDatasetForInteractions(inputDir, outputDir);
+ interactor.findChi2SumDifferences(maxNumCovariatesToRegress, startRoundCompareChi2, ensgAnnotationFile);
+ } else if (convertMatrix){
+ System.out.println("input file: " + inputDir);
+ System.out.println("output file: " + outputDir);
+ if(inputDir.equals(outputDir)){
+ System.err.println("input == output");
+ System.exit(1);
+ }
+ new ExpressionDataset(inputDir).save(outputDir);
+ }
+ else {
+ new TestEQTLDatasetForInteractions(inputDir, outputDir, eqtlFile, maxNumCovariatesToRegress, annotationFile, covariates, covariates2, snpsToSwapFile, permute, covariatesToTest, hashSamples, numThreads, cohorts, snpsToTestFile, skipNormalization, skipCovariateNormalization, eqtlFileCovariates);
+ }
+ }
+
+}
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/ExpressionDataset.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/ExpressionDataset.java
new file mode 100644
index 000000000..b446ebe19
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/ExpressionDataset.java
@@ -0,0 +1,583 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+import java.io.*;
+import java.net.*;
+import java.util.*;
+import java.awt.image.BufferedImage;
+import java.awt.image.*;
+import java.awt.*;
+import java.awt.geom.*;
+import java.lang.Math;
+import javax.imageio.*;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ *
+ * @author lude
+ */
+public class ExpressionDataset {
+
+ public double[][] rawData = null;
+ public int nrSamples = 0;
+ public int nrProbes = 0;
+ public String[] probeNames = null;
+ public String[] sampleNames = null;
+ public HashMap hashSamples = new HashMap();
+ public HashMap hashProbes = new HashMap();
+ private HashMap hashProbesToInclude = null;
+ private HashMap hashSamplesToInclude = null;
+ public String fileName = null;
+
+    /**
+     * Loads a dataset from {@code fileName}. Files whose name ends in ".binary"
+     * are read with the custom binary loader; anything else is parsed as
+     * tab-delimited text.
+     */
+    public ExpressionDataset(String fileName) {
+        if (fileName.endsWith(".binary")) {
+            loadExpressionDataInBinaryFormat(fileName);
+        } else {
+            loadExpressionData(fileName, '\t');
+        }
+    }
+
+    /**
+     * Loads a dataset from {@code fileName} using a caller-supplied column
+     * delimiter for text files. NOTE(review): the delimiter is ignored for
+     * ".binary" files — the binary loader takes only the file name.
+     */
+    public ExpressionDataset(String fileName, char delimiter) {
+        if (fileName.endsWith(".binary")) {
+            loadExpressionDataInBinaryFormat(fileName);
+        } else {
+            loadExpressionData(fileName, delimiter);
+        }
+    }
+
+    /**
+     * Loads a dataset restricted to the probes present as keys in
+     * {@code hashProbesToInclude} (stored in the private filter field and
+     * presumably consulted by the loaders — the loader bodies are not visible
+     * here, so confirm the filter semantics there). A null map means no
+     * probe filtering.
+     */
+    public ExpressionDataset(String fileName, char delimiter, HashMap hashProbesToInclude) {
+        this.hashProbesToInclude = hashProbesToInclude;
+        if (fileName.endsWith(".binary")) {
+            loadExpressionDataInBinaryFormat(fileName);
+        } else {
+            loadExpressionData(fileName, delimiter);
+        }
+    }
+
+    /**
+     * Loads a dataset restricted to the probes in {@code hashProbesToInclude}
+     * and the samples in {@code hashSamplesToInclude} (keys are the names;
+     * values are unused — callers elsewhere in this patch put null values).
+     * Either map may be null, meaning no filtering on that axis.
+     */
+    public ExpressionDataset(String fileName, char delimiter, HashMap hashProbesToInclude, HashMap hashSamplesToInclude) {
+        this.hashProbesToInclude = hashProbesToInclude;
+        this.hashSamplesToInclude = hashSamplesToInclude;
+        if (fileName.endsWith(".binary")) {
+            loadExpressionDataInBinaryFormat(fileName);
+        } else {
+            loadExpressionData(fileName, delimiter);
+        }
+    }
+
+ public ExpressionDataset(int nrProbes, int nrSamples) {
+ this.nrProbes = nrProbes;
+ this.nrSamples = nrSamples;
+ sampleNames = new String[nrSamples];
+ for (int s=0; s2 && data[1].length() > 0 && data[1].equals("MultipleHits")) {
+ dataIsInTriTyperFormat = true;
+ sampleOffset = 9;
+
+ }
+
+ if (hashSamplesToInclude==null) {
+ nrSamples = data.length - sampleOffset;
+ sampleNames = new String[nrSamples];
+ sampleIndex = new int[nrSamples];
+ for (int s=0; s> 56);
+ buffer[bufferLoc + 1] = (byte) (bits >> 48 & 0xff);
+ buffer[bufferLoc + 2] = (byte) (bits >> 40 & 0xff);
+ buffer[bufferLoc + 3] = (byte) (bits >> 32 & 0xff);
+ buffer[bufferLoc + 4] = (byte) (bits >> 24 & 0xff);
+ buffer[bufferLoc + 5] = (byte) (bits >> 16 & 0xff);
+ buffer[bufferLoc + 6] = (byte) (bits >> 8 & 0xff);
+ buffer[bufferLoc + 7] = (byte) (bits & 0xff);
+ bufferLoc += 8;
+ }
+ try {
+ out.write(buffer);
+ } catch (IOException e) {
+ System.err.println("Can't write to " + fileBinary.getName() + ": " + e.getMessage());
+ System.exit(1);
+ }
+ }
+ try {
+ out.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ File fileProbes = new File(fileName + ".rows.txt");
+ try {
+ java.io.BufferedWriter outProbes = new java.io.BufferedWriter(new java.io.FileWriter(fileProbes));
+ for (int p=0; p>> 24),
+ (byte) (value >>> 16),
+ (byte) (value >>> 8),
+ (byte) value};
+ }
+
+ private int byteArrayToInt(byte[] b) {
+ return (b[0] << 24)
+ + ((b[1] & 0xff) << 16)
+ + ((b[2] & 0xff) << 8)
+ + (b[3] & 0xff);
+ }
+
+
+}
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/GeneAnnotation.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/GeneAnnotation.java
new file mode 100644
index 000000000..d38d252d9
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/GeneAnnotation.java
@@ -0,0 +1,43 @@
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+/**
+ *
+ * @author Patrick Deelen
+ */
+public class GeneAnnotation {
+
+    // Immutable value holder for one gene's annotation record. Field semantics
+    // are inferred from the names only — no usage is visible in this file:
+    // "ensg" presumably holds an Ensembl gene identifier ("ENSG..."), and
+    // "huho" presumably the HGNC/HUGO gene symbol (NOTE(review): "huho" looks
+    // like a typo for "hugo"; renaming the field/getter would break callers,
+    // so it is kept as-is — confirm against the annotation file it is read
+    // from). chr/start/end give the genomic location.
+    private final String ensg;
+    private final String huho;
+    private final String chr;
+    private final int start;
+    private final int end;
+
+    // Stores the given values verbatim; no validation or defensive copying
+    // is performed (all fields are immutable types).
+    public GeneAnnotation(String ensg, String huho, String chr, int start, int end) {
+        this.ensg = ensg;
+        this.huho = huho;
+        this.chr = chr;
+        this.start = start;
+        this.end = end;
+    }
+
+    /** @return the "ensg" value passed at construction. */
+    public String getEnsg() {
+        return ensg;
+    }
+
+    /** @return the "huho" (gene symbol) value passed at construction. */
+    public String getHuho() {
+        return huho;
+    }
+
+    /** @return the chromosome value passed at construction. */
+    public String getChr() {
+        return chr;
+    }
+
+    /** @return the start coordinate passed at construction. */
+    public int getStart() {
+        return start;
+    }
+
+    /** @return the end coordinate passed at construction. */
+    public int getEnd() {
+        return end;
+    }
+
+}
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/InteractionPlotter.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/InteractionPlotter.java
new file mode 100644
index 000000000..950953e8a
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/InteractionPlotter.java
@@ -0,0 +1,574 @@
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+import au.com.bytecode.opencsv.CSVReader;
+import au.com.bytecode.opencsv.CSVWriter;
+import java.awt.Color;
+import java.awt.Graphics2D;
+import java.awt.RenderingHints;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashMap;
+import static nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser.TestEQTLDatasetForInteractions.getEqtls;
+import static nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser.TestEQTLDatasetForInteractions.getLinearRegressionCoefficients;
+import static nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser.TestEQTLDatasetForInteractions.orthogonalizeDataset;
+import org.apache.commons.math3.stat.ranking.NaturalRanking;
+
+/**
+ *
+ * @author Patrick Deelen
+ */
+public class InteractionPlotter {
+
+    // Input/output directories taken from the command line in main() and held
+    // as mutable class-wide state; only ever written once, at startup.
+    static String inputDir = null;
+    static String outputDir = null;
+
+ /**
+ * @param args the command line arguments
+ */
+ public static void main(String[] args) throws IOException {
+
+ //makeInteractionPlot("D:\\tmp\\test.png", new double[]{0,0,0,0.2,1,1,1,1,2,2,2}, new double[]{5,4,3,0.2,8,12,6,7,23,5,7}, new double[]{3,2,1,0.2,2,6,4,6,20,2,5});
+
+ inputDir = args[0];
+ outputDir = args[1];
+ String eQTLfileName = args[2];
+ String covariate = args[3];
+ File genesFile = new File(args[4]);
+
+
+
+ System.out.println("Input dir: " + inputDir);
+ System.out.println("Output dir: " + outputDir);
+ System.out.println("eQTL file: " + eQTLfileName);
+ System.out.println("covariate: " + covariate);
+ System.out.println("genes file: " + genesFile.getAbsolutePath());
+
+ String[] covsToCorrect = {"gender", "GC", "MEDIAN_5PRIME_BIAS", "MEDIAN_3PRIME_BIAS", "CEU", "GBR", "FIN", "TSI", "YRI"};
+ //String[] covsToCorrect = {"age", "gender", "GC", "MEDIAN_5PRIME_BIAS", "MEDIAN_3PRIME_BIAS", "LLdeep", "LLS", "RS", "CODAM"};
+ HashMap hashEQTLs = getEqtls(eQTLfileName);
+
+ HashMap hashSamples = new HashMap();
+
+ if (1 == 1) {
+
+ System.out.println("Removing outlier samples!!!");
+ HashMap hashCovariates = new HashMap();
+ hashCovariates.put("MEDIAN_5PRIME_BIAS", null);
+ hashCovariates.put("MEDIAN_3PRIME_BIAS", null);
+ ExpressionDataset datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', hashCovariates, null);
+ hashSamples = new HashMap();
+ for (int s = 0; s < datasetCovariates.nrSamples; s++) {
+ if (datasetCovariates.rawData[0][s] != 0) {
+ hashSamples.put(datasetCovariates.sampleNames[s], null);
+ }
+ }
+ datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', hashCovariates, hashSamples);
+ HashMap hashSamplesToExclude = new HashMap();
+ if (1 == 1) {
+ int index = ((Integer) datasetCovariates.hashProbes.get("MEDIAN_5PRIME_BIAS")).intValue();
+ double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[index]);
+ double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[index]);
+ for (int s = 0; s < datasetCovariates.nrSamples; s++) {
+ double z = (datasetCovariates.rawData[index][s] - mean) / stdev;
+ if (Math.abs(z) > 3) {
+ hashSamplesToExclude.put(datasetCovariates.sampleNames[s], null);
+ }
+ }
+ }
+ if (1 == 1) {
+ int index = ((Integer) datasetCovariates.hashProbes.get("MEDIAN_3PRIME_BIAS")).intValue();
+ double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[index]);
+ double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[index]);
+ for (int s = 0; s < datasetCovariates.nrSamples; s++) {
+ double z = (datasetCovariates.rawData[index][s] - mean) / stdev;
+ if (Math.abs(z) > 3) {
+ hashSamplesToExclude.put(datasetCovariates.sampleNames[s], null);
+ }
+ }
+ }
+ hashSamples = new HashMap();
+ for (int s = 0; s < datasetCovariates.nrSamples; s++) {
+ if (!hashSamplesToExclude.containsKey(datasetCovariates.sampleNames[s])) {
+ hashSamples.put(datasetCovariates.sampleNames[s], null);
+ hashSamples.put(datasetCovariates.sampleNames[s] + "_exp", null);
+ hashSamples.put(datasetCovariates.sampleNames[s] + "_dosage", null);
+ }
+ }
+ }
+
+ ExpressionDataset datasetGenotypes = new ExpressionDataset(inputDir + "/bigTableLude.txt.Genotypes.binary", '\t', hashEQTLs, hashSamples);
+ ExpressionDataset datasetExpression = new ExpressionDataset(inputDir + "/bigTableLude.txt.Expression.binary", '\t', hashEQTLs, hashSamples);
+ ExpressionDataset datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', null, hashSamples);
+
+ org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression regression = new org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression();
+ int nrSamples = datasetGenotypes.nrSamples;
+
+
+ if (1 == 1) {
+ //Define a set of covariates that we want to use as correction:
+ System.out.println("Correcting gene expression data for cohort specific effects and top 25 components");
+ //String[] cohorts = {"LLDeep", "LLS", "RS", "CODAM"};
+ int nrCompsToCorrectFor = 25;
+ ExpressionDataset datasetCovariatesToCorrectFor = new ExpressionDataset(nrCompsToCorrectFor, datasetGenotypes.nrSamples);
+ datasetCovariatesToCorrectFor.sampleNames = datasetGenotypes.sampleNames;
+// for (int p = 0; p < cohorts.length; p++) {
+// for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+// if (datasetGenotypes.sampleNames[s].startsWith(cohorts[p])) {
+// datasetCovariatesToCorrectFor.rawData[p][s] = 1;
+// }
+// }
+// }
+ if (nrCompsToCorrectFor > 0) {
+ for (int comp = 0; comp < nrCompsToCorrectFor; comp++) {
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ datasetCovariatesToCorrectFor.rawData[comp][s] = datasetCovariates.rawData[datasetCovariates.nrProbes - 51 + comp][s];
+ }
+ }
+ }
+
+ datasetCovariatesToCorrectFor.transposeDataset();
+
+ datasetCovariatesToCorrectFor.save(inputDir + "/CovariatesToCorrectFor.txt");
+ orthogonalizeDataset(inputDir + "/CovariatesToCorrectFor.txt");
+ datasetCovariatesToCorrectFor = new ExpressionDataset(inputDir + "/CovariatesToCorrectFor.txt.PrincipalComponents.txt");
+ datasetCovariatesToCorrectFor.transposeDataset();
+ ExpressionDataset datasetCovariatesToCorrectForEigenvalues = new ExpressionDataset(inputDir + "/CovariatesToCorrectFor.txt.Eigenvalues.txt");
+ for (int snp = 0; snp < datasetExpression.nrProbes; snp++) {
+ for (int cov = 0; cov < datasetCovariatesToCorrectFor.nrProbes; cov++) {
+ if (datasetCovariatesToCorrectForEigenvalues.rawData[cov][0] > 1E-5) {
+ double[] rc = getLinearRegressionCoefficients(datasetCovariatesToCorrectFor.rawData[cov], datasetExpression.rawData[snp]);
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ datasetExpression.rawData[snp][s] -= rc[0] * datasetCovariatesToCorrectFor.rawData[cov][s];
+ }
+ }
+ }
+ }
+
+
+ }
+
+
+ double[] mainEQTLCorr = new double[datasetGenotypes.nrProbes];
+ if (1 == 1) {
+ System.out.println("Enforcing for every eQTL that the genotype dosage positively correlated with gene expression levels:");
+ for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) {
+ double corr = JSci.maths.ArrayMath.correlation(datasetGenotypes.rawData[snp], datasetExpression.rawData[snp]);
+ //System.out.println(datasetExpression.probeNames[snp] + "\t" + snp + "\t" + corr);
+
+ if (corr < 0) {
+ corr = -corr;
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ datasetGenotypes.rawData[snp][s] = 2 - datasetGenotypes.rawData[snp][s];
+ }
+ }
+
+ mainEQTLCorr[snp] = corr;
+ }
+ }
+
+ if (1 == 1) {
+
+ if (1 == 1) {
+ System.out.println("Correcting covariate data for cohort specific effects:");
+// String[] cohorts = {"LLDeep","LLS","RS","CODAM"};
+ ExpressionDataset datasetCovariatesToCorrectFor = new ExpressionDataset(covsToCorrect.length, datasetGenotypes.nrSamples);
+ datasetCovariatesToCorrectFor.sampleNames = datasetGenotypes.sampleNames;
+// for (int p=0; p 1E-5) {
+ double[] rc = getLinearRegressionCoefficients(datasetCovariatesToCorrectFor.rawData[cov], datasetCovariates.rawData[p]);
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ datasetCovariates.rawData[p][s] -= rc[0] * datasetCovariatesToCorrectFor.rawData[cov][s];
+ }
+ }
+ }
+ double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[p]);
+ double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[p]);
+ if (stdev < 1E-5) {
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ datasetCovariates.rawData[p][s] = mean;
+ }
+ }
+ }
+ }
+
+
+ }
+
+ if (1 == 1) {
+ System.out.println("Correcting covariate data for cis-eQTL effects:");
+ for (int p = 0; p < datasetCovariates.nrProbes; p++) {
+ if (datasetExpression.hashProbes.containsKey(datasetCovariates.probeNames[p])) {
+ int index = ((Integer) datasetExpression.hashProbes.get(datasetCovariates.probeNames[p])).intValue();
+ double[] rc = getLinearRegressionCoefficients(datasetGenotypes.rawData[index], datasetCovariates.rawData[p]);
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ datasetCovariates.rawData[p][s] -= rc[0] * datasetGenotypes.rawData[index][s];
+ }
+ }
+ }
+ }
+
+ if (1 == 2) {
+ datasetCovariates.save(inputDir + "/CovariatesCorrected.txt");
+ HashMap hashProbesToFilter = new HashMap();
+ for (int p = 0; p < datasetCovariates.nrProbes; p++) {
+ if (datasetCovariates.probeNames[p].startsWith("ENSG")) {
+ hashProbesToFilter.put(datasetCovariates.probeNames[p], null);
+ }
+ }
+ ExpressionDataset datasetCovariatesCorrected = new ExpressionDataset(inputDir + "/CovariatesCorrected.txt", '\t', hashProbesToFilter, null);
+ datasetCovariatesCorrected.transposeDataset();
+ datasetCovariatesCorrected.save(inputDir + "/CovariatesCorrected.txt");
+ System.exit(0);
+ }
+
+ if (1 == 2) {
+ ExpressionDataset datasetICA = new ExpressionDataset("/Users/lude/Documents/ICA/mixingmatrix.txt");
+ //ExpressionDataset datasetICA = new ExpressionDataset("/Users/lude/Documents/ICA/signals.txt");
+ datasetICA.transposeDataset();
+ for (int p = 0; p < datasetICA.nrProbes; p++) {
+ datasetCovariates.rawData[p] = datasetICA.rawData[p];
+ datasetCovariates.probeNames[p] = datasetICA.probeNames[p];
+ if (p == 7) {
+ for (int q = 0; q < datasetCovariates.nrProbes; q++) {
+ double corr = JSci.maths.ArrayMath.correlation(datasetICA.rawData[p], datasetCovariates.rawData[q]);
+ System.out.println(p + "\t" + datasetICA.probeNames[p] + "\t" + q + "\t" + datasetCovariates.probeNames[q] + "\t" + corr + "\t" + corr * corr);
+ }
+ }
+ }
+
+ orthogonalizeDataset("/Users/lude/Documents/ICA/mixingmatrix.txt");
+ //System.exit(0);
+ }
+
+ System.out.println("Enforcing normal distribution on covariates");
+
+ NaturalRanking ranker = new NaturalRanking();
+
+ for (int p = 0; p < datasetCovariates.nrProbes; p++) {
+ //Rank order the expression values:
+ double[] values = new double[datasetCovariates.nrSamples];
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ values[s] = datasetCovariates.rawData[p][s];
+ }
+ double[] rankedValues = ranker.rank(values);
+ //Replace the original expression value with the standard distribution enforce:
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ //Convert the rank to a proportion, with range <0, 1>
+ double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length);
+ //Convert the pValue to a Z-Score:
+ double zScore = cern.jet.stat.tdouble.Probability.normalInverse(pValue);
+ datasetCovariates.rawData[p][s] = zScore; //Replace original expression value with the Z-Score
+ }
+ }
+
+ }
+
+ cern.jet.random.tdouble.engine.DoubleRandomEngine randomEngine = new cern.jet.random.tdouble.engine.DRand();
+
+ ExpressionDataset datasetExpressionBeforeEQTLCorrection = new ExpressionDataset(datasetExpression.nrProbes, datasetExpression.nrSamples);
+ for (int p = 0; p < datasetExpression.nrProbes; p++) {
+ for (int s = 0; s < datasetExpression.nrSamples; s++) {
+ datasetExpressionBeforeEQTLCorrection.rawData[p][s] = datasetExpression.rawData[p][s];
+ }
+ }
+
+ if (1 == 1) {
+ System.out.println("Correcting expression data for predefined gene environment interaction effects (GC content, Gender, 5'Median Bias, 3'Median Bias):");
+ int[] covsToCorrectIndex = new int[covsToCorrect.length];
+ for (int c = 0; c < covsToCorrect.length; c++) {
+ covsToCorrectIndex[c] = ((Integer) datasetCovariates.hashProbes.get(covsToCorrect[c])).intValue();
+ }
+ for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) {
+ double[][] valsX = new double[nrSamples][1 + covsToCorrect.length * 2]; //store genotypes, covariates, interactions
+ for (int s = 0; s < nrSamples; s++) {
+ valsX[s][0] = datasetGenotypes.rawData[snp][s]; //genotypes
+ }
+ for (int c = 0; c < covsToCorrect.length; c++) {
+ for (int s = 0; s < nrSamples; s++) {
+ valsX[s][c * 2 + 1] = datasetCovariates.rawData[covsToCorrectIndex[c]][s]; //covariate
+ valsX[s][c * 2 + 2] = valsX[s][0] * valsX[s][c * 2 + 1]; //interction
+ }
+ }
+ double[] valsY = datasetExpression.rawData[snp];
+ regression.newSampleData(valsY, valsX);
+ datasetExpression.rawData[snp] = regression.estimateResiduals();
+ }
+ }
+
+
+ if (1 == 1) {
+ System.out.println("Enforcing normal distribution on expression data:");
+
+ NaturalRanking ranker = new NaturalRanking();
+
+ for (int p = 0; p < datasetExpression.nrProbes; p++) {
+ //Rank order the expression values:
+ double[] values = new double[datasetExpression.nrSamples];
+ for (int s = 0; s < datasetExpression.nrSamples; s++) {
+ values[s] = datasetExpression.rawData[p][s];
+ }
+
+ double[] rankedValues = ranker.rank(values);
+ //Replace the original expression value with the standard distribution enforce:
+ for (int s = 0; s < datasetExpression.nrSamples; s++) {
+ //Convert the rank to a proportion, with range <0, 1>
+ double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length);
+ //Convert the pValue to a Z-Score:
+ double zScore = cern.jet.stat.tdouble.Probability.normalInverse(pValue);
+ datasetExpression.rawData[p][s] = zScore; //Replace original expression value with the Z-Score
+ }
+ }
+
+ System.out.println("Expression data now force normal");
+
+ }
+
+
+ CSVReader reader = new CSVReader(new FileReader(genesFile), '\t', CSVWriter.NO_QUOTE_CHARACTER);
+ String[] nextLine;
+ while ((nextLine = reader.readNext()) != null) {
+
+ String eQtlGene = nextLine[0];
+
+ System.out.println(eQtlGene);
+
+ Integer eQtlGeneI = datasetExpression.hashProbes.get(eQtlGene);
+ Integer covariateI = datasetCovariates.hashProbes.get(covariate);
+ Integer snpI = eQtlGeneI;
+
+ makeInteractionPlot(outputDir + "/" + covariate + "-" + eQtlGene + ".png", datasetGenotypes.rawData[snpI], datasetExpression.rawData[eQtlGeneI], datasetCovariates.rawData[covariateI]);
+
+ }
+
+ }
+
+    /**
+     * Renders a genotype-stratified interaction scatter plot to a PNG file.
+     * Each sample is drawn at (covariate, expression), colored by its genotype
+     * dosage bucket (&lt; 0.5, 0.5–1.5, &gt; 1.5). An OLS model
+     * expression ~ genotype + covariate + genotype*covariate is fitted, the
+     * two-sided interaction p-value is printed under the plot, and one fitted
+     * regression line is drawn per dosage level (0, 1, 2).
+     *
+     * The three input arrays must all have the same length (one entry per
+     * sample); this is assumed, not checked.
+     */
+    public static void makeInteractionPlot(String fileName, double[] genotype, double[] expression, double[] covariate) {
+
+        int nrSamples = genotype.length;
+
+//        int[] cohortIndex = new int[4];
+//        String[] cohorts = {"LLDeep", "LLS", "RS", "CODAM"};
+//        for (int cohort = 0; cohort < cohorts.length; cohort++) {
+//            for (int s = 0; s < nrSamples; s++) {
+//                if (sampleNames[s].startsWith(cohorts[cohort])) {
+//                    cohortIndex[cohort] = s;
+//                    break;
+//                }
+//            }
+//        }
+
+        // Fixed canvas layout: 500x500 plotting area with asymmetric margins
+        // (the wide right margin was apparently reserved for the commented-out
+        // per-cohort side panels below).
+        int marginLeft = 100;
+        int marginRight = 200;
+        int marginTop = 100;
+        int marginBottom = 100;
+        int innerHeight = 500;
+        int innerWidth = 500;
+        int docWidth = marginLeft + marginRight + innerWidth;
+        int docHeight = marginTop + marginBottom + innerHeight;
+
+        BufferedImage bimage = new BufferedImage(docWidth, docHeight, BufferedImage.TYPE_INT_RGB);
+        Graphics2D g2d = bimage.createGraphics();
+
+        // White background, antialiased drawing.
+        g2d.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);
+        g2d.setColor(new Color(255, 255, 255));
+        g2d.fillRect(0, 0, docWidth, docHeight);
+        // Pre-built alpha levels; only alphaComposite10 and alphaComposite50
+        // are actually used below.
+        java.awt.AlphaComposite alphaComposite10 = java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.10f);
+        java.awt.AlphaComposite alphaComposite25 = java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.25f);
+        java.awt.AlphaComposite alphaComposite50 = java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.50f);
+        java.awt.AlphaComposite alphaComposite100 = java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC, 1.00f);
+
+        // NOTE(review): these fonts and the dataColor palette are set up but
+        // never referenced in this method — presumably copied from a sibling
+        // plotting routine; kept here untouched.
+        float fontSize = 12f;
+        java.awt.Font font = new java.awt.Font("Gill Sans MT", java.awt.Font.PLAIN, (int) fontSize);
+        java.awt.Font fontBold = new java.awt.Font("Gill Sans MT", java.awt.Font.BOLD, (int) fontSize);
+        java.awt.Font fontSmall = new java.awt.Font("Gill Sans MT", java.awt.Font.PLAIN, 8);
+        java.awt.Font fontBoldSmall = new java.awt.Font("Gill Sans MT", java.awt.Font.BOLD, 8);
+
+        java.awt.Color dataColor[] = new Color[10];
+        dataColor[0] = new java.awt.Color(167, 72, 20);
+        dataColor[1] = new java.awt.Color(62, 138, 20);
+        dataColor[2] = new java.awt.Color(228, 171, 0);
+        dataColor[3] = new java.awt.Color(0, 148, 183);
+        dataColor[4] = new java.awt.Color(119, 80, 152);
+        dataColor[5] = new java.awt.Color(106, 106, 106);
+        dataColor[6] = new java.awt.Color(212, 215, 10);
+        dataColor[7] = new java.awt.Color(210, 111, 0);
+        dataColor[8] = new java.awt.Color(0, 0, 141);
+        dataColor[9] = new java.awt.Color(190, 190, 190);
+
+        // Axes: left (y) and bottom (x) borders of the inner plotting area.
+        g2d.setComposite(alphaComposite50);
+        g2d.setColor(new Color(0, 0, 0));
+        g2d.drawLine(marginLeft, marginTop, marginLeft, marginTop + innerHeight);
+        g2d.drawLine(marginLeft, marginTop + innerHeight, marginLeft + innerWidth, marginTop + innerHeight);
+
+        // Data ranges used to scale samples into the inner area. If either
+        // range is zero (constant input) the divisions below yield NaN.
+        double minX = JSci.maths.ArrayMath.min(covariate);
+        double maxX = JSci.maths.ArrayMath.max(covariate);
+        double minY = JSci.maths.ArrayMath.min(expression);
+        double maxY = JSci.maths.ArrayMath.max(expression);
+
+        // Scatter the samples. NOTE(review): the rep loop runs exactly once
+        // (rep == 0); it looks like a leftover of a multi-ring glow effect
+        // (compare the rep = 2 start in the commented-out block below).
+        g2d.setComposite(alphaComposite10);
+        for (int rep = 0; rep >= 0; rep--) {
+            for (int s = 0; s < nrSamples; s++) {
+                int posY = marginTop + innerHeight - (int) ((expression[s] - minY) / (maxY - minY) * innerHeight);
+                int posX = marginLeft + (int) ((covariate[s] - minX) / (maxX - minX) * innerWidth);
+                // Color by genotype dosage bucket: < 0.5 red-ish, > 1.5
+                // green-ish, otherwise (heterozygous range) blue-ish.
+                if (genotype[s] < 0.5) {
+                    g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f));
+                    g2d.setColor(new Color(204, 86, 78));
+                } else {
+                    if (genotype[s] > 1.5) {
+                        g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f));
+                        g2d.setColor(new Color(171, 178, 114));
+                    } else {
+                        g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f));
+                        g2d.setColor(new Color(98, 175, 255));
+                    }
+                }
+
+                g2d.fillOval(posX - 5 - rep * 4, posY - 5 - rep * 4, 7 + rep * 8, 7 + rep * 8);
+
+            }
+        }
+
+        //Draw the four independent cohorts seperately:
+        //int[] cohortIndex = {0,626,1280,1933};
+//        for (int rep = 2; rep >= 0; rep--) {
+//            for (int s = 0; s < nrSamples; s++) {
+//                int cohort = 0;
+//                for (int c = 0; c < cohortIndex.length; c++) {
+//                    if (s >= cohortIndex[c]) {
+//                        cohort = c;
+//                    }
+//                }
+//
+//                int posY = marginTop + 100 + cohort * 125 - (int) ((expression[s] - minY) / (maxY - minY) * 100);
+//                int posX = marginLeft + innerWidth + 50 + (int) ((covariate[s] - minX) / (maxX - minX) * 100);
+//                if (genotype[s] < 0.5) {
+//                    g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f));
+//                    g2d.setColor(new Color(204, 86, 78));
+//                } else {
+//                    if (genotype[s] > 1.5) {
+//                        g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f));
+//                        g2d.setColor(new Color(171, 178, 114));
+//                    } else {
+//                        g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f));
+//                        g2d.setColor(new Color(98, 175, 255));
+//                    }
+//                }
+//                g2d.fillOval(posX - 1 - rep * 2, posY - 1 - rep * 2, 3 + rep * 4, 3 + rep * 4);
+//
+//            }
+//        }
+
+
+        // Fit expression ~ genotype + covariate + genotype*covariate (OLS adds
+        // an intercept, so betas = {intercept, genotype, covariate, interaction}).
+        g2d.setComposite(alphaComposite50);
+        double[][] valsX = new double[nrSamples][3];
+        for (int s = 0; s < nrSamples; s++) {
+            valsX[s][0] = genotype[s];
+            valsX[s][1] = covariate[s];
+            valsX[s][2] = valsX[s][0] * valsX[s][1];
+        }
+        double[] valsY = expression;
+        org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression regression = new org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression();
+        regression.newSampleData(valsY, valsX);
+        double[] betas = regression.estimateRegressionParameters();
+        double betaInteraction = betas[3];
+        double seInteraction = regression.estimateRegressionParametersStandardErrors()[3];
+        double tInteraction = betaInteraction / seInteraction;
+        double pValueInteraction = 1;
+        double zScoreInteraction = 0;
+        // Student-t with n - 4 degrees of freedom (4 estimated parameters).
+        // The one-sided tail p is floored at 2.0E-323 (close to the smallest
+        // positive double) so normalInverse() stays finite, then doubled to
+        // make the p-value two-sided. zScoreInteraction is computed but only
+        // the p-value is used below.
+        cern.jet.random.tdouble.engine.DoubleRandomEngine randomEngine = new cern.jet.random.tdouble.engine.DRand();
+        cern.jet.random.tdouble.StudentT tDistColt = new cern.jet.random.tdouble.StudentT(genotype.length - 4, randomEngine);
+        if (tInteraction < 0) {
+            pValueInteraction = tDistColt.cdf(tInteraction);
+            if (pValueInteraction < 2.0E-323) {
+                pValueInteraction = 2.0E-323;
+            }
+            zScoreInteraction = cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction);
+        } else {
+            pValueInteraction = tDistColt.cdf(-tInteraction);
+            if (pValueInteraction < 2.0E-323) {
+                pValueInteraction = 2.0E-323;
+            }
+            zScoreInteraction = -cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction);
+        }
+        pValueInteraction *= 2;
+
+        // Scientific notation by default; plain decimal for p > 0.001.
+        String pValueString = (new java.text.DecimalFormat("0.##E0", new java.text.DecimalFormatSymbols(java.util.Locale.US))).format(pValueInteraction);
+        if (pValueInteraction > 0.001) {
+            pValueString = (new java.text.DecimalFormat("##.###;-##.###", new java.text.DecimalFormatSymbols(java.util.Locale.US))).format(pValueInteraction);
+        }
+        g2d.setFont(new java.awt.Font("Arial", java.awt.Font.BOLD, 14));
+        g2d.setColor(new Color(0, 0, 0));
+        int posX = marginLeft;
+        int posY = marginTop + innerHeight + 20;
+        g2d.drawString("Interaction P-Value: " + pValueString, posX, posY);
+
+
+        // One fitted line per genotype dosage level (0, 1, 2), evaluated at the
+        // covariate extremes. Each line is drawn twice: a thick white
+        // under-stroke for contrast, then the genotype color on top (same
+        // bucket colors as the scatter; with integer g only g == 0 hits the
+        // first branch and g == 2 the second).
+        for (int g = 0; g <= 2; g++) {
+
+            double valMin = betas[0] + betas[1] * g + minX * betas[2] + betas[3] * g * minX;
+            double valMax = betas[0] + betas[1] * g + maxX * betas[2] + betas[3] * g * maxX;
+            int posXMin = marginLeft + (int) ((minX - minX) / (maxX - minX) * innerWidth);
+            int posYMin = marginTop + innerHeight - (int) ((valMin - minY) / (maxY - minY) * innerHeight);
+            int posXMax = marginLeft + (int) ((maxX - minX) / (maxX - minX) * innerWidth);
+            int posYMax = marginTop + innerHeight - (int) ((valMax - minY) / (maxY - minY) * innerHeight);
+
+            g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.8f));
+            g2d.setColor(new Color(255, 255, 255));
+            g2d.setStroke(new java.awt.BasicStroke(5.0f, java.awt.BasicStroke.CAP_ROUND, java.awt.BasicStroke.JOIN_ROUND));
+            g2d.drawLine(posXMin, posYMin, posXMax, posYMax);
+            if (g < 0.5) {
+                g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.30f));
+                g2d.setColor(new Color(204, 86, 78));
+            } else {
+                if (g > 1.5) {
+                    g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.50f));
+                    g2d.setColor(new Color(171, 178, 114));
+                } else {
+                    g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.50f));
+                    g2d.setColor(new Color(98, 175, 255));
+                }
+            }
+            g2d.setStroke(new java.awt.BasicStroke(3.0f, java.awt.BasicStroke.CAP_ROUND, java.awt.BasicStroke.JOIN_ROUND));
+            g2d.drawLine(posXMin, posYMin, posXMax, posYMax);
+
+        }
+
+        // Write the image; failures are logged but not rethrown (best-effort
+        // plotting — the caller's loop continues with the next gene).
+        try {
+            javax.imageio.ImageIO.write(bimage, "png", new File(fileName));
+        } catch (IOException e) {
+            System.out.println(e.getMessage());
+            e.printStackTrace();
+        }
+
+
+    }
+}
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/PerformInteractionAnalysisPermutationTask.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/PerformInteractionAnalysisPermutationTask.java
new file mode 100644
index 000000000..f31b5e6f2
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/PerformInteractionAnalysisPermutationTask.java
@@ -0,0 +1,118 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+import gnu.trove.set.hash.TIntHashSet;
+import java.util.concurrent.Callable;
+import org.apache.commons.math3.linear.SingularMatrixException;
+import org.apache.commons.math3.stat.regression.SimpleRegression;
+
+/**
+ *
+ * @author lude
+ */
+public class PerformInteractionAnalysisPermutationTask implements Callable {
+
+ public ExpressionDataset datasetGenotypes;
+ public ExpressionDataset datasetExpression;
+ public ExpressionDataset datasetCovariates;
+ ExpressionDataset datasetCovariatesPCAForceNormal;
+ public int covToTest = -1;
+ public int nrSamples = -1;
+ public org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression regression = null;
+ public cern.jet.random.tdouble.StudentT tDistColt = null;
+ private final SkippedInteractionTracker skippedTracker;
+ private final SkippedInteractionWriter skippedWriter;
+ private final TIntHashSet snpsToTest;
+
+ public PerformInteractionAnalysisPermutationTask(ExpressionDataset datasetGenotypes, ExpressionDataset datasetExpression, ExpressionDataset datasetCovariates, ExpressionDataset datasetCovariatesPCAForceNormal, int covToTest, SkippedInteractionWriter skippedWriter, final TIntHashSet snpsToTest) {
+ this.datasetGenotypes = datasetGenotypes;
+ this.datasetExpression = datasetExpression;
+ this.datasetCovariates = datasetCovariates;
+ this.datasetCovariatesPCAForceNormal = datasetCovariatesPCAForceNormal;
+ this.covToTest = covToTest;
+ this.nrSamples = datasetGenotypes.nrSamples;
+ this.skippedTracker = new SkippedInteractionTracker(datasetCovariates.probeNames[covToTest]);
+ this.skippedWriter = skippedWriter;
+ this.snpsToTest = snpsToTest;
+
+ this.regression = new org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression();
+ cern.jet.random.tdouble.engine.DoubleRandomEngine randomEngine = new cern.jet.random.tdouble.engine.DRand();
+ this.tDistColt = new cern.jet.random.tdouble.StudentT(this.nrSamples - 4, randomEngine);
+
+ }
+
+ @Override
+ public DoubleArrayIntegerObject call() throws Exception {
+ double corrPvalueThreshold = 0.0001;
+ double[] zScores = new double[datasetGenotypes.nrProbes];
+ for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) {
+
+ if(snpsToTest != null && !snpsToTest.contains(snp)){
+ continue;
+ }
+
+ double corrPvalue = correlateCovariateWithGenotype(snp);
+ if (corrPvalue > corrPvalueThreshold) { // don't compute the interaction if the covariate expression is affected by theis SNP
+ try {
+
+ double[][] valsX = new double[nrSamples][3];
+ for (int s = 0; s < nrSamples; s++) {
+ valsX[s][0] = datasetGenotypes.rawData[snp][s];
+ valsX[s][1] = datasetCovariates.rawData[covToTest][s];
+ valsX[s][2] = valsX[s][0] * valsX[s][1];
+ }
+ double[] valsY = datasetExpression.rawData[snp];
+ regression.newSampleData(valsY, valsX);
+ double betaInteraction = regression.estimateRegressionParameters()[3];
+ double seInteraction = regression.estimateRegressionParametersStandardErrors()[3];
+ double tInteraction = betaInteraction / seInteraction;
+ double pValueInteraction = 1;
+ double zScoreInteraction = 0;
+ if (tInteraction < 0) {
+ pValueInteraction = tDistColt.cdf(tInteraction);
+ if (pValueInteraction < 2.0E-323) {
+ pValueInteraction = 2.0E-323;
+ }
+ zScoreInteraction = cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction);
+ } else {
+ pValueInteraction = tDistColt.cdf(-tInteraction);
+ if (pValueInteraction < 2.0E-323) {
+ pValueInteraction = 2.0E-323;
+ }
+ zScoreInteraction = -cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction);
+ }
+ zScores[snp] = zScoreInteraction;
+ } catch (SingularMatrixException e) {
+ zScores[snp] = 0;
+ skippedTracker.addSkipped(SkippedInteractionTracker.Reason.SINGULAR, datasetGenotypes.probeNames[snp]);
+ }
+ }
+ else{
+ //System.out.println("Removing covariate because of eQTL effect! " + datasetCovariatesPCAForceNormal.probeNames[covToTest] + " : " + datasetGenotypes.probeNames[snp]);
+ skippedTracker.addSkipped(SkippedInteractionTracker.Reason.SHARED_QTL, datasetGenotypes.probeNames[snp]);
+ zScores[snp] = 0;
+ }
+
+ }
+ skippedWriter.add(skippedTracker);
+ return new DoubleArrayIntegerObject(zScores, covToTest);
+ }
+
+ private double correlateCovariateWithGenotype(int snp){
+ SimpleRegression simpleRegression = new SimpleRegression();
+ double[] expression = datasetCovariatesPCAForceNormal.rawData[covToTest];
+ double[] genotypes = datasetGenotypes.rawData[snp];
+ for (int s = 0; s < expression.length; s++) {
+ simpleRegression.addData(expression[s], genotypes[s]);
+ }
+ //This is not working now that we have the _rs next to the gene names
+// if (datasetGenotypes.probeNames[snp].equals(datasetCovariatesPCAForceNormal.probeNames[covToTest])){
+// System.out.println("Same gene! " + datasetGenotypes.probeNames[snp] + "\t" + datasetCovariatesPCAForceNormal.probeNames[covToTest] + "\t" + simpleRegression.getSignificance() + "\t" + simpleRegression.getR());
+// }
+ return simpleRegression.getSignificance();
+ }
+}
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionTracker.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionTracker.java
new file mode 100644
index 000000000..6c55604df
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionTracker.java
@@ -0,0 +1,40 @@
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.EnumMap;
+
+/**
+ *
+ * @author Patrick Deelen
+ */
/**
 * Per-covariate record of which SNP-covariate interaction tests were skipped
 * and why: a singular design matrix, or the covariate sharing a QTL with the
 * SNP. One tracker is created per analysis task and handed to
 * SkippedInteractionWriter when the task finishes.
 *
 * Note: generic type parameters were missing in this copy (invalid
 * {@code EnumMap>}); restored here as {@code EnumMap<Reason, ArrayList<String>>}.
 *
 * @author Patrick Deelen
 */
public class SkippedInteractionTracker {

    /** Why an interaction test was skipped. */
    public static enum Reason {
        SINGULAR, SHARED_QTL
    }

    private final String covariate;
    // reason -> names of the QTLs (SNPs) skipped for that reason
    EnumMap<Reason, ArrayList<String>> skipped;

    public SkippedInteractionTracker(String covariate) {
        this.covariate = covariate;
        skipped = new EnumMap<Reason, ArrayList<String>>(Reason.class);
        // Pre-populate every reason so callers never see a null list.
        for (Reason r : Reason.values()) {
            skipped.put(r, new ArrayList<String>());
        }
    }

    /** Records that the test against {@code qtl} was skipped for reason {@code r}. */
    public void addSkipped(Reason r, String qtl) {
        skipped.get(r).add(qtl);
    }

    public String getCovariate() {
        return covariate;
    }

    public EnumMap<Reason, ArrayList<String>> getSkipped() {
        return skipped;
    }

}
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionWriter.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionWriter.java
new file mode 100644
index 000000000..abaa52563
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionWriter.java
@@ -0,0 +1,65 @@
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+import au.com.bytecode.opencsv.CSVWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+
+/**
+ *
+ * @author Patrick Deelen
+ */
+public class SkippedInteractionWriter {
+
+ private final CSVWriter writer;
+ private final String[] row = new String[5];
+ private int c;
+ private StringBuilder tmp;
+
+ public SkippedInteractionWriter(File skippedInteractionsFile) throws IOException {
+ writer = new CSVWriter(new FileWriter(skippedInteractionsFile), '\t', CSVWriter.NO_QUOTE_CHARACTER);
+
+ c = 0;
+ row[c++] = "Covariate";
+ row[c++] = "CountSingular";
+ row[c++] = "CountSharedQtl";
+ row[c++] = "SingularQtls";
+ row[c++] = "SharedQtls";
+
+ writer.writeNext(row);
+ }
+
+ public void close() throws IOException{
+ writer.close();
+ }
+
+ synchronized void add(SkippedInteractionTracker skipped){
+
+ ArrayList singular = skipped.getSkipped().get(SkippedInteractionTracker.Reason.SINGULAR);
+ ArrayList sharedQtl = skipped.getSkipped().get(SkippedInteractionTracker.Reason.SHARED_QTL);
+
+ c = 0;
+ row[c++] = skipped.getCovariate();
+ row[c++] = String.valueOf(singular.size());
+ row[c++] = String.valueOf(sharedQtl.size());
+
+ tmp = new StringBuilder();
+ for(String qtl : singular){
+ tmp.append(qtl);
+ tmp.append(';');
+ }
+ row[c++] = tmp.toString();
+
+ tmp = new StringBuilder();
+ for(String qtl : sharedQtl){
+ tmp.append(qtl);
+ tmp.append(';');
+ }
+ row[c++] = tmp.toString();
+
+ writer.writeNext(row);
+
+ }
+
+}
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObject.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObject.java
new file mode 100644
index 000000000..07016fac6
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObject.java
@@ -0,0 +1,21 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
/**
 * Minimal mutable pair coupling a String with an int, used as a sortable
 * (value, index) carrier by StringIntegerObjectSorter.
 *
 * @author ludefranke
 */
public class StringIntegerObject {

    public String stringValue;
    public int intValue;

    /**
     * @param stringValue the string component (sort key)
     * @param intValue the integer component carried alongside it
     */
    public StringIntegerObject(String stringValue, int intValue) {
        this.stringValue = stringValue;
        this.intValue = intValue;
    }

}
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObjectSorter.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObjectSorter.java
new file mode 100644
index 000000000..bb25906f8
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObjectSorter.java
@@ -0,0 +1,36 @@
+/*
+ * GeneLocationObjectSorter.java
+ *
+ * Created on 23 December 2003, 17:14
+ */
+
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+/**
+ *
+ * @author Like
+ */
+public class StringIntegerObjectSorter extends VectorSorter {
+
+ private java.text.Collator collatorUS = null;
+
+ /** Creates a new instance of GeneLocationObjectSorter */
+ public StringIntegerObjectSorter() {
+ super();
+ collatorUS = java.text.Collator.getInstance(java.util.Locale.US);
+ }
+
+ /** Override object comparer
+ * @param a the first GeneLocationObject to be compared
+ * @param b the second GeneLocationObject to be compared
+ * @return true if the first GeneLocationObject.getChrStart() is lower than the second one
+ */
+ protected boolean lt (Object a, Object b) {
+ if (collatorUS.compare(((StringIntegerObject)a).stringValue, ((StringIntegerObject)b).stringValue) >= 0) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+}
\ No newline at end of file
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/TestEQTLDatasetForInteractions.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/TestEQTLDatasetForInteractions.java
new file mode 100644
index 000000000..9611fa474
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/TestEQTLDatasetForInteractions.java
@@ -0,0 +1,1540 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+import au.com.bytecode.opencsv.CSVReader;
+import au.com.bytecode.opencsv.CSVWriter;
+import com.google.common.collect.HashMultimap;
+import gnu.trove.set.hash.TIntHashSet;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.Vector;
+import java.util.concurrent.CompletionService;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.Executors;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.math3.exception.MathIllegalArgumentException;
+import org.apache.commons.math3.stat.descriptive.moment.Variance;
+import org.apache.commons.math3.stat.ranking.NaturalRanking;
+import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression;
+import org.apache.mahout.math.Arrays;
+import umcg.genetica.genomicboundaries.GenomicBoundary;
+import umcg.genetica.io.Gpio;
+import umcg.genetica.io.text.TextFile;
+import umcg.genetica.io.trityper.EQTL;
+import umcg.genetica.io.trityper.QTLTextFile;
+
+/**
+ *
+ * @author lude
+ */
+public class TestEQTLDatasetForInteractions {
+
+ String inputDir = null;
+ String outputDir = null;
+ HashMap> geneDistanceMap = null;
+ String[] primaryCovsToCorrect;
+ ExpressionDataset datasetGenotypes;
+
+ public TestEQTLDatasetForInteractions(String inputDir, String outputDir) throws IOException {
+
+ this.inputDir = inputDir;
+ this.outputDir = outputDir;
+ primaryCovsToCorrect = new String[]{"gender", "GC", "MEDIAN_5PRIME_BIAS", "MEDIAN_3PRIME_BIAS", "LLdeep", "RS", "CODAM", "LLS"};
+ //preprocessData();
+ }
+
    /**
     * Full analysis constructor: loads the genotype data, reads the eQTL
     * file(s) into probe->SNP multimaps, optionally restricts which SNPs are
     * tested, then runs the interaction analysis maxNumTopCovs times — each
     * round regressing out the strongest interacting covariate found in the
     * previous round and appending it to the correction list.
     *
     * NOTE(review): generic type parameters appear to have been stripped from
     * this copy (raw HashMap/HashMultimap/Iterator) — presumably String-keyed;
     * confirm against the original source, as written the raw iterators will
     * not compile.
     */
    public TestEQTLDatasetForInteractions(String inputDir, String outputDir, String eQTLfileName, int maxNumTopCovs, String annotationFile, String[] covariatesToCorrect, String[] covariatesToCorrect2, File snpsToSwapFile, boolean permute, String[] covariatesToTest, HashMap hashSamples, int numThreads, String[] cohorts, File snpsToTestFile, boolean skipNormalization, boolean skipCovariateNormalization, String eQTLfileNameCovariates) throws IOException, Exception {

        // Echo the run configuration so it is captured in any log.
        System.out.println("Input dir: " + inputDir);
        System.out.println("Output dir: " + outputDir);
        System.out.println("eQTL file: " + eQTLfileName);
        System.out.println("eQTL file covariates: " + eQTLfileNameCovariates);
        System.out.println("Maximum number of covariates to regress out: " + maxNumTopCovs);
        System.out.println("Covariates to correct for with interaction: " + Arrays.toString(covariatesToCorrect));
        System.out.println("Covariates to correct for without interaction: " + Arrays.toString(covariatesToCorrect2));
        if (covariatesToTest != null) {
            System.out.println("Covariates to test: " + Arrays.toString(covariatesToTest));
        }

        this.inputDir = inputDir;
        this.outputDir = outputDir;
        primaryCovsToCorrect = covariatesToCorrect;
        if (!Gpio.exists(outputDir)) {
            Gpio.createDir(outputDir);
        }

        initGenotypes(permute, hashSamples, cohorts);

        // probe (gene) -> SNP names of its known eQTLs; used downstream to
        // correct the expression data for eQTL effects.
        final HashMultimap qtlProbeSnpMultiMap = HashMultimap.create();
        if (eQTLfileName != null) {
            final QTLTextFile eQtlFileReader = new QTLTextFile(eQTLfileName, false);
            for (Iterator it = eQtlFileReader.getEQtlIterator(); it.hasNext();) {
                EQTL qtl = it.next();
                qtlProbeSnpMultiMap.put(qtl.getProbe(), qtl.getRsName());
            }
        }

        // Separate probe->SNP map used when correcting the covariates
        // themselves; falls back to the main map when no covariate eQTL
        // file is supplied.
        final HashMultimap qtlProbeSnpMultiMapCovariates;
        if(eQTLfileNameCovariates != null){
            qtlProbeSnpMultiMapCovariates = HashMultimap.create();
            final QTLTextFile eQtlFileReader = new QTLTextFile(eQTLfileNameCovariates, false);
            for (Iterator it = eQtlFileReader.getEQtlIterator(); it.hasNext();) {
                EQTL qtl = it.next();
                qtlProbeSnpMultiMapCovariates.put(qtl.getProbe(), qtl.getRsName());
            }
        } else {
            qtlProbeSnpMultiMapCovariates = qtlProbeSnpMultiMap;
        }

        if (annotationFile != null) {
            createGeneDistanceMap(annotationFile);
        }

        // Optional confinement: translate SNP names from the file into row
        // indices of the genotype dataset; null means "test every SNP".
        final TIntHashSet snpsToTest;
        if (snpsToTestFile != null) {

            snpsToTest = new TIntHashSet();
            // NOTE(review): this reader is never closed — file handle leak.
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(snpsToTestFile), "UTF-8"));

            String line;
            while ((line = reader.readLine()) != null) {
                Integer genotypeI = datasetGenotypes.hashProbes.get(line);

                if (genotypeI == null) {
                    System.out.println("SNP " + line + " not found in genotype data");
                    continue;
                }

                if (!snpsToTest.add(genotypeI)) {
                    System.out.println("Warning including SNP twice: " + line);
                }

            }

            System.out.println("Confining testing to: " + snpsToTest.size() + " SNPs from: " + snpsToTestFile.getAbsolutePath());

        } else {
            snpsToTest = null;
        }

        //preprocessData();

        TextFile outputTopCovs = new TextFile(outputDir + "/outputTopCovariates.txt", true);

        System.out.print("\nPrimary covariates to correct for before running interaction analysis: ");
        for (String cov : primaryCovsToCorrect) {
            System.out.print("\n\t" + cov);
        }
        System.out.println();

        // Greedy forward selection: each round returns the covariate with the
        // strongest interaction signal, which is then added to the correction
        // set for the next round.
        String[] covsToCorrect = primaryCovsToCorrect;
        int cnt = 0;
        while (cnt < maxNumTopCovs) {
            String topCov = performInteractionAnalysis(covsToCorrect, covariatesToCorrect2, outputTopCovs, snpsToSwapFile, qtlProbeSnpMultiMap, covariatesToTest, hashSamples, numThreads, snpsToTest, skipNormalization, skipCovariateNormalization, qtlProbeSnpMultiMapCovariates);
            // Grow covsToCorrect by one with the newly found top covariate.
            String[] covsToCorrectNew = new String[covsToCorrect.length + 1];
            for (int c = 0; c < covsToCorrect.length; c++) {
                covsToCorrectNew[c] = covsToCorrect[c];
            }
            covsToCorrectNew[covsToCorrect.length] = topCov;
            covsToCorrect = covsToCorrectNew;
            cnt++;
        }
        outputTopCovs.close();
    }
+
    /**
     * Loads the binary genotype dosage matrix, and when {@code permute} is set
     * replaces it with a copy whose sample columns are randomly shuffled
     * (a sampling-without-replacement permutation of sample indices).
     *
     * NOTE(review): the per-cohort restriction is commented out, so every
     * iteration of the cohort loop re-permutes ALL samples and overwrites
     * permSampleIDs — only the last cohort iteration takes effect. Presumably
     * intentional while cohort-wise permutation was disabled; confirm.
     */
    private void initGenotypes(boolean permute, HashMap hashSamples, String[] cohorts) {

        datasetGenotypes = new ExpressionDataset(inputDir + "/bigTableLude.txt.Genotypes.binary", '\t', null, hashSamples);

        if (permute) {
            System.out.println("WARNING: PERMUTING GENOTYPE DATA!!!!");
            if (cohorts == null) {
                cohorts = new String[]{"LLDeep", "LLS", "RS", "CODAM"};
            }
            int[] permSampleIDs = new int[datasetGenotypes.nrSamples];
            for (int p = 0; p < cohorts.length; p++) {
                // Collect candidate sample indices (cohort filter disabled).
                Vector vecSamples = new Vector();
                for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
                    //if (datasetGenotypes.sampleNames[s].startsWith(cohorts[p])) {
                    vecSamples.add(s);
                    //}
                }

                // Draw without replacement: each position gets a random,
                // not-yet-used sample index.
                for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
                    //if (datasetGenotypes.sampleNames[s].startsWith(cohorts[p])) {
                    int randomSample = ((Integer) vecSamples.remove((int) ((double) vecSamples.size() * Math.random()))).intValue();
                    permSampleIDs[s] = randomSample;
                    //}
                }
            }

            // Materialize the permuted dataset; probe/sample names keep their
            // original order so downstream lookups are unaffected.
            ExpressionDataset datasetGenotypes2 = new ExpressionDataset(datasetGenotypes.nrProbes, datasetGenotypes.nrSamples);
            datasetGenotypes2.probeNames = datasetGenotypes.probeNames;
            datasetGenotypes2.sampleNames = datasetGenotypes.sampleNames;
            datasetGenotypes2.recalculateHashMaps();
            for (int p = 0; p < datasetGenotypes2.nrProbes; p++) {
                for (int s = 0; s < datasetGenotypes2.nrSamples; s++) {
                    datasetGenotypes2.rawData[p][s] = datasetGenotypes.rawData[p][permSampleIDs[s]];
                }
            }
            datasetGenotypes = datasetGenotypes2;
        }

    }
+
    /**
     * Extracts eQTL gene names from column 4 of a tab-separated eQTL file
     * (eqtlmappingpipeline format).
     *
     * NOTE(review): generics were stripped in this copy — the raw for-each
     * over {@code genes} only compiles if the list is parameterized
     * (presumably {@code ArrayList<String>}); confirm against the original.
     *
     * @param fname - eQTL file (in the eqtlmappingpipeline format); may be null
     * @return gene names as the keys of a HashMap (all values are null), or
     *         null when {@code fname} is null
     * @throws IOException if the file cannot be read
     */
    public static HashMap getEqtls(String fname) throws IOException {
        if (fname == null) {
            return null;
        }
        TextFile file = new TextFile(fname, false);
        // Column index 4 holds the gene (probe) name.
        ArrayList genes = file.readAsArrayList(4, TextFile.tab);
        HashMap eqtlGenes = new HashMap();
        for (String gene : genes) {
            eqtlGenes.put(gene, null);
        }
        file.close();
        return eqtlGenes;

    }
+
+ public void interpretInteractionZScoreMatrix(int maxNumRegressedCovariates, int numPrimaryCovsToCorrect, int zscoreDiffThreshold) throws IOException {
+
+ System.out.println("Interpreting the z-score matrix");
+
+ for (int nrCovsRemoved = numPrimaryCovsToCorrect; nrCovsRemoved < numPrimaryCovsToCorrect + maxNumRegressedCovariates; nrCovsRemoved++) {
+ if (! new File(outputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt.binary.dat").exists()) {
+ ExpressionDataset dataset = new ExpressionDataset(outputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt");
+ dataset.save(dataset.fileName + ".binary");
+ }
+ else {
+ System.out.println("Binary z-score matrix already exists, not overwriting it: " + outputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt.binary.dat");
+ }
+ }
+
+ TextFile out = new TextFile(outputDir + "zscoreDiff.txt", true);
+ out.writeln("numCovsRemoved\tcovariate\teQTL\tz-score_before\tz-score_after\tdifference");
+ for (int nrCovsRemoved = numPrimaryCovsToCorrect; nrCovsRemoved < numPrimaryCovsToCorrect + maxNumRegressedCovariates; nrCovsRemoved++) {
+
+ ExpressionDataset dataset = new ExpressionDataset(outputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt.binary");
+ ExpressionDataset dataset2 = new ExpressionDataset(outputDir + "/InteractionZScoresMatrix-" + (nrCovsRemoved + 1) + "Covariates.txt.binary");
+
+ for (int q = 0; q < dataset.nrSamples; q++) {
+ double maxAbsZDiff = 0;
+ String output = "";
+ for (int p = 0; p < dataset.nrProbes; p++) {
+ double zDiff = dataset.rawData[p][q] - dataset2.rawData[p][q];
+ double absZDiff = Math.abs(zDiff);
+ if (absZDiff > 2 && absZDiff > maxAbsZDiff) {
+ maxAbsZDiff = absZDiff;
+ output = nrCovsRemoved + "\t" + dataset.probeNames[p] + "\t" + dataset.sampleNames[q] + "\t" + dataset.rawData[p][q] + "\t" + dataset2.rawData[p][q] + "\t" + zDiff;
+ }
+ }
+ if (maxAbsZDiff > zscoreDiffThreshold) {
+ System.out.println(output);
+ out.writeln(output);
+ }
+ }
+ }
+ out.close();
+ }
+
    /**
     * For every covariate, computes the chi2-sum (sum of squared interaction
     * z-scores over all genes) per covariate-removal round, writes a table of
     * chi2-sums and round-to-round differences (chi2diff.txt), and saves the
     * z-score profile of each round's top covariate (topCovZ.txt). The top
     * covariate is the one with the highest chi2-sum, excluding principal
     * components ("Comp*") and the cohort indicator covariates.
     *
     * NOTE(review): generics were stripped in this copy — the raw
     * {@code Map.get} assigned to GeneAnnotation only compiles if the map is
     * parameterized (presumably {@code Map<String, GeneAnnotation>}); confirm.
     */
    public void findChi2SumDifferences(int maxNumRegressedCovariates, int numPrimaryCovsToCorrect, File ensgAnnotationFile) throws IOException {

        // Optional ENSG -> gene annotation lookup for the output columns.
        Map ensgAnnotations;
        if (ensgAnnotationFile == null) {
            ensgAnnotations = Collections.emptyMap();
        } else {
            ensgAnnotations = readEnsgAnnotations(ensgAnnotationFile);
        }

        double[][] topCovZscores = null;
        String[] topCovs = new String[maxNumRegressedCovariates];
        String[] genes = null;

        System.out.println("Interpreting the z-score matrix");
        System.out.println("Preparing the data");
        // Ensure a binary copy of each round's z-score matrix exists.
        for (int nrCovsRemoved = numPrimaryCovsToCorrect; nrCovsRemoved < numPrimaryCovsToCorrect + maxNumRegressedCovariates; nrCovsRemoved++) {

            if (new File(inputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt.binary.dat").exists()) {
                System.out.println("");
                System.out.println("USING EXISTING BINARY FILE!!!!");
                System.out.println("");
                continue;
            }

            ExpressionDataset dataset = new ExpressionDataset(inputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt");
            dataset.save(dataset.fileName + ".binary");
        }

        System.out.println("Comparing chi2sums");

        double[] previousChi2 = null;        // chi2-sum per covariate from the previous round
        String[][] output = null;            // one row per covariate: annotation + per-round columns
        boolean firstDataset = true;
        String[] header = null;
        double topCovChi2;
        String topCov = "Technical";         // label for the first round's header column
        int topCovI = -1;

        for (int nrCovsRemoved = numPrimaryCovsToCorrect; nrCovsRemoved < numPrimaryCovsToCorrect + maxNumRegressedCovariates; nrCovsRemoved++) {

            // Rows (probes) = covariates, columns (samples) = genes/eQTLs.
            ExpressionDataset dataset = new ExpressionDataset(inputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt.binary");

            if (firstDataset) {
                // First round: allocate outputs and fill the annotation columns.
                previousChi2 = new double[dataset.nrProbes];
                output = new String[dataset.nrProbes][(maxNumRegressedCovariates * 2) + 5];
                for (int covariate = 0; covariate < dataset.nrProbes; covariate++) {
                    output[covariate][0] = dataset.probeNames[covariate];
                    GeneAnnotation geneAnnotation = ensgAnnotations.get(dataset.probeNames[covariate]);
                    if (geneAnnotation == null) {
                        output[covariate][1] = "";
                        output[covariate][2] = "";
                        output[covariate][3] = "";
                        output[covariate][4] = "";
                    } else {
                        output[covariate][1] = geneAnnotation.getHuho();
                        output[covariate][2] = geneAnnotation.getChr();
                        output[covariate][3] = String.valueOf(geneAnnotation.getStart());
                        output[covariate][4] = String.valueOf(geneAnnotation.getEnd());
                    }

                }
                header = new String[(maxNumRegressedCovariates * 2) + 5];
                header[0] = "Covariate gene";
                header[1] = "Gene symbol";
                header[2] = "Chr";
                header[3] = "Start";
                header[4] = "End";

                genes = dataset.sampleNames;
                topCovZscores = new double[genes.length][maxNumRegressedCovariates];
            }

            // Two columns per round: chi2-sum and difference vs previous round.
            int outputColOffset = 5 + (nrCovsRemoved - numPrimaryCovsToCorrect) * 2;

            // Column is labeled with the covariate removed before this round.
            header[outputColOffset] = topCov + "_removed_chi2sum";
            header[1 + outputColOffset] = "Difference";

            topCovChi2 = 0;

            for (int covariate = 0; covariate < dataset.nrProbes; covariate++) {
                // chi2-sum = sum over genes of z^2 for this covariate.
                double chi2Sum = 0;
                double[] covariateData = dataset.rawData[covariate];
                for (int gene = 0; gene < dataset.nrSamples; gene++) {
                    chi2Sum += covariateData[gene] * covariateData[gene];
                }

                // Candidate top covariate: exclude PCs ("Comp*") and the
                // cohort indicator covariates.
                if (chi2Sum > topCovChi2 && !dataset.probeNames[covariate].startsWith("Comp") && !dataset.probeNames[covariate].equals("LLS") && !dataset.probeNames[covariate].equals("LLdeep") && !dataset.probeNames[covariate].equals("RS") && !dataset.probeNames[covariate].equals("CODAM")) {
                    topCovChi2 = chi2Sum;
                    topCov = dataset.probeNames[covariate];
                    topCovI = covariate;
                }

                output[covariate][outputColOffset] = String.valueOf(chi2Sum);
                output[covariate][1 + outputColOffset] = firstDataset ? "0" : String.valueOf(previousChi2[covariate] - chi2Sum);
                previousChi2[covariate] = chi2Sum;

                //System.out.println(nrCovsRemoved + "\t" + dataset.probeNames[covariate] + "\t" + chi2Sum1 + "\t" + chi2Sum2 + "\t" + (chi2Sum1 - chi2Sum2));
            }

            // Remember this round's top covariate and its z-score per gene.
            topCovs[nrCovsRemoved - numPrimaryCovsToCorrect] = topCov;
            double[] covariateData = dataset.rawData[topCovI];
            for (int gene = 0; gene < dataset.nrSamples; gene++) {
                topCovZscores[gene][nrCovsRemoved - numPrimaryCovsToCorrect] = covariateData[gene];
            }

            firstDataset = false;

        }

        // Write the chi2-sum table.
        CSVWriter writer = new CSVWriter(new FileWriter(outputDir + "/chi2diff.txt"), '\t', CSVWriter.NO_QUOTE_CHARACTER);
        writer.writeNext(header);
        for (String[] row : output) {
            writer.writeNext(row);
        }
        writer.close();

        // Save the top-covariate z-score profiles (genes x rounds).
        ExpressionDataset topCovZDataset = new ExpressionDataset(genes.length, topCovs.length);
        topCovZDataset.rawData = topCovZscores;
        topCovZDataset.probeNames = genes;
        topCovZDataset.sampleNames = topCovs;
        topCovZDataset.save(outputDir + "/topCovZ.txt");


    }
+
+ public void preprocessData() {
+
+ HashMap hashGenotypes = new HashMap();
+ HashMap hashExpression = new HashMap();
+ HashMap hashEQTLs = new HashMap();
+ ArrayList snps = new ArrayList();
+ int countExcludedLines = 0;
+
+ try {
+ java.io.BufferedReader in = new java.io.BufferedReader(new java.io.FileReader(new File(inputDir + "/bigTableLude.txt")));
+ String str = in.readLine();
+ String[] data = str.split("\t");
+ for (int d = 0; d < data.length; d++) {
+ System.out.println(d + "\t" + data[d]);
+ if (data[d].endsWith("_dosage")) {
+ hashGenotypes.put(data[d], null);
+ }
+ if (data[d].endsWith("_exp")) {
+ hashExpression.put(data[d], null);
+ }
+ }
+ int itr = 0;
+ while ((str = in.readLine()) != null) {
+ if (!str.contains("NA")) {
+ data = str.split("\t");
+ hashEQTLs.put(data[0], null);
+ snps.add(data[1]);
+ itr++;
+ if (itr % 100 == 0) {
+ System.out.println(itr);
+ }
+ } else {
+ ++countExcludedLines;
+ }
+ }
+ } catch (Exception e) {
+ System.out.println("Error:\t" + e.getMessage());
+ e.printStackTrace();
+ }
+
+ System.out.println("EXCLUDED LINES: " + countExcludedLines);
+
+ ExpressionDataset datasetGenotypes = new ExpressionDataset(inputDir + "/bigTableLude.txt", '\t', hashEQTLs, hashGenotypes);
+ datasetGenotypes.probeNames = snps.toArray(new String[snps.size()]);
+ datasetGenotypes.recalculateHashMaps();
+
+ ExpressionDataset datasetExpression = new ExpressionDataset(inputDir + "/bigTableLude.txt", '\t', hashEQTLs, hashExpression);
+ datasetGenotypes.save(datasetGenotypes.fileName + ".Genotypes.binary");
+ datasetExpression.save(datasetGenotypes.fileName + ".Expression.binary");
+
+ ExpressionDataset datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt");
+ datasetCovariates.save(datasetCovariates.fileName + ".Covariates.binary");
+ System.exit(0);
+
+ }
+
+ public final String performInteractionAnalysis(String[] covsToCorrect, String[] covsToCorrect2, TextFile outputTopCovs, File snpsToSwapFile, HashMultimap qtlProbeSnpMultiMap, String[] covariatesToTest, HashMap hashSamples, int numThreads, final TIntHashSet snpsToTest, boolean skipNormalization, boolean skipCovariateNormalization, HashMultimap qtlProbeSnpMultiMapCovariates) throws IOException, Exception {
+
+ //hashSamples = excludeOutliers(hashSamples);
+
+ HashMap covariatesToLoad = new HashMap();
+ if (covariatesToTest != null) {
+ for (String c : covariatesToTest) {
+ covariatesToLoad.put(c, null);
+ }
+ for (String c : covsToCorrect) {
+ covariatesToLoad.put(c, null);
+ }
+ for (String c : covsToCorrect2) {
+ covariatesToLoad.put(c, null);
+ }
+ for (int i = 1; i <= 50; ++i) {
+ covariatesToLoad.put("Comp" + i, null);
+ }
+ } else {
+ covariatesToLoad = null;
+ }
+
+ ExpressionDataset datasetExpression = new ExpressionDataset(inputDir + "/bigTableLude.txt.Expression.binary", '\t', null, hashSamples);
+ ExpressionDataset datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', covariatesToLoad, hashSamples);
+
+ org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression regression = new org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression();
+ int nrSamples = datasetGenotypes.nrSamples;
+
+ correctDosageDirectionForQtl(snpsToSwapFile, datasetGenotypes, datasetExpression);
+
+ if(!skipNormalization){
+ correctExpressionData(covsToCorrect2, datasetGenotypes, datasetCovariates, datasetExpression);
+ }
+
+
+
+ ExpressionDataset datasetCovariatesPCAForceNormal = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', covariatesToLoad, hashSamples);
+
+ if(!skipNormalization && !skipCovariateNormalization){
+ correctCovariateDataPCA(covsToCorrect2, covsToCorrect, datasetGenotypes, datasetCovariatesPCAForceNormal);
+ }
+
+
+ if (1 == 1) {
+
+ if (!skipNormalization && !skipCovariateNormalization && covsToCorrect2.length != 0 && covsToCorrect.length != 0) {
+ correctCovariateData(covsToCorrect2, covsToCorrect, datasetGenotypes, datasetCovariates);
+ }
+
+ if (!skipNormalization && !skipCovariateNormalization && !qtlProbeSnpMultiMapCovariates.isEmpty()) {
+ correctCovariatesForQtls(datasetCovariates, datasetGenotypes, qtlProbeSnpMultiMapCovariates);
+ }
+
+
+ if (1 == 2) {
+ saveCorrectedCovariates(datasetCovariates);
+ }
+
+ if (1 == 2) {
+ icaCovariates(datasetCovariates);
+ }
+ if(!skipNormalization){
+ forceNormalCovariates(datasetCovariates, datasetGenotypes);
+ }
+
+ }
+
+ ExpressionDataset datasetExpressionBeforeEQTLCorrection = new ExpressionDataset(datasetExpression.nrProbes, datasetExpression.nrSamples);
+ for (int p = 0; p < datasetExpression.nrProbes; p++) {
+ for (int s = 0; s < datasetExpression.nrSamples; s++) {
+ datasetExpressionBeforeEQTLCorrection.rawData[p][s] = datasetExpression.rawData[p][s];
+ }
+ }
+
+ if(!skipNormalization && covsToCorrect.length != 0){
+ correctExpressionDataForInteractions(covsToCorrect, datasetCovariates, datasetGenotypes, nrSamples, datasetExpression, regression, qtlProbeSnpMultiMap);
+ }
+
+ if(!skipNormalization){
+ forceNormalExpressionData(datasetExpression);
+ }
+
+ datasetExpression.save(outputDir + "/expressionDataRound_" + covsToCorrect.length + ".txt");
+ datasetExpression.save(outputDir + "/expressionDataRound_" + covsToCorrect.length + ".binary");
+ datasetCovariates.save(outputDir + "/covariateData_" + covsToCorrect.length + ".binary");
+
+
+
+
+ if (1 == 1) {
+
+
+
+ ExpressionDataset datasetZScores = new ExpressionDataset(datasetCovariates.nrProbes, datasetExpression.nrProbes);
+ datasetZScores.probeNames = datasetCovariates.probeNames;
+
+ datasetZScores.sampleNames = new String[datasetGenotypes.probeNames.length];
+ for (int i = 0; i < datasetGenotypes.probeNames.length; ++i) {
+ datasetZScores.sampleNames[i] = datasetGenotypes.probeNames[i] + datasetExpression.probeNames[i].substring(datasetExpression.probeNames[i].lastIndexOf('_'));
+ }
+
+ datasetZScores.recalculateHashMaps();
+
+ SkippedInteractionWriter skippedWriter = new SkippedInteractionWriter(new File(outputDir + "/skippedInteractionsRound_" + covsToCorrect.length + ".txt"));
+
+ java.util.concurrent.ExecutorService threadPool = Executors.newFixedThreadPool(numThreads);
+ CompletionService pool = new ExecutorCompletionService(threadPool);
+ int nrTasks = 0;
+ for (int cov = 0; cov < datasetCovariates.nrProbes; cov++) {
+ double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[cov]);
+ if (stdev > 0) {
+ PerformInteractionAnalysisPermutationTask task = new PerformInteractionAnalysisPermutationTask(datasetGenotypes, datasetExpression, datasetCovariates, datasetCovariatesPCAForceNormal, cov, skippedWriter, snpsToTest);
+ pool.submit(task);
+ nrTasks++;
+ }
+ }
+
+ String maxChi2Cov = "";
+ int maxChi2CovI = 0;
+ double maxChi2 = 0;
+ try {
+ // If gene annotation provided, for chi2sum calculation use only genes that are 1mb apart
+ //if (geneDistanceMap != null) {
+ for (int task = 0; task < nrTasks; task++) {
+ try {
+ //System.out.println("Waiting on thread for: " + datasetCovariates.probeNames[cov]);
+ DoubleArrayIntegerObject result = pool.take().get();
+ int cov = result.intValue;
+ double chi2Sum = 0;
+ double[] covZ = datasetZScores.rawData[cov];
+ for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) {
+ //if (genesFarAway(datasetZScores.sampleNames[snp], datasetZScores.probeNames[cov])) {
+ double z = result.doubleArray[snp];
+ covZ[snp] = z;
+ if (!Double.isNaN(z)) {
+ chi2Sum += z * z;
+ }
+ //}
+ }
+
+ if (chi2Sum > maxChi2 && !datasetCovariates.probeNames[cov].startsWith("Comp") && !datasetCovariates.probeNames[cov].equals("LLS") && !datasetCovariates.probeNames[cov].equals("LLdeep") && !datasetCovariates.probeNames[cov].equals("RS") && !datasetCovariates.probeNames[cov].equals("CODAM")) {
+ maxChi2 = chi2Sum;
+ maxChi2CovI = cov;
+ maxChi2Cov = datasetCovariates.probeNames[cov];
+ }
+ //System.out.println(covsToCorrect.length + "\t" + cov + "\t" + datasetCovariates.probeNames[cov] + "\t" + chi2Sum);
+ if ((task + 1) % 512 == 0) {
+ System.out.println(task + 1 + " tasks processed");
+ }
+ } catch (ExecutionException ex) {
+ Logger.getLogger(PerformInteractionAnalysisPermutationTask.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ }
+ /*} //If gene annotation not provided, use all gene pairs
+ else {
+ for (int task = 0; task < nrTasks; task++) {
+ try {
+ DoubleArrayIntegerObject result = pool.take().get();
+ int cov = result.intValue;
+ double chi2Sum = 0;
+ double[] covZ = datasetZScores.rawData[cov];
+ for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) {
+ double z = result.doubleArray[snp];
+ covZ[snp] = z;
+ if (!Double.isNaN(z)) {
+ chi2Sum += z * z;
+ }
+ }
+ if (chi2Sum > maxChi2) {
+ maxChi2 = chi2Sum;
+ maxChi2Cov = datasetCovariates.probeNames[cov];
+ }
+ //System.out.println(covsToCorrect.length + "\t" + cov + "\t" + datasetCovariates.probeNames[cov] + "\t" + chi2Sum);
+ if ((task + 1) % 512 == 0) {
+ System.out.println(task + 1 + " tasks processed");
+ }
+ } catch (ExecutionException ex) {
+ Logger.getLogger(PerformInteractionAnalysisPermutationTask.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ }
+ }*/
+ threadPool.shutdown();
+ } catch (Exception e) {
+ e.printStackTrace();
+ System.out.println(e.getMessage());
+ }
+
+ System.out.println("Top covariate:\t" + maxChi2 + "\t" + maxChi2Cov);
+ outputTopCovs.writeln("Top covariate:\t" + maxChi2 + "\t" + maxChi2Cov);
+ outputTopCovs.flush();
+ skippedWriter.close();
+ datasetZScores.save(outputDir + "/InteractionZScoresMatrix-" + covsToCorrect.length + "Covariates.txt");
+
+ BufferedWriter writer = new BufferedWriter(new FileWriter(outputDir + "/" + "topCov" + maxChi2Cov + "_expression.txt"));
+ double[] topCovExpression = datasetCovariates.rawData[maxChi2CovI];
+ for (int i = 0; i < topCovExpression.length; ++i) {
+ writer.append(datasetCovariates.sampleNames[i]);
+ writer.append('\t');
+ writer.append(String.valueOf(topCovExpression[i]));
+ writer.append('\n');
+ }
+ writer.close();
+
+ return maxChi2Cov;
+ }
+
+ return null;
+ }
+
+ /**
+ * Creates a map of gene name to GenomicBoundary containing gene coordinates
+ * and the coordinate of its midpoint as annotation
+ *
+ * @param annotFname - path to the annotation file (in the
+ * eqtlmappingpipeline format)
+ * @throws IOException
+ */
+ private void createGeneDistanceMap(String annotFname) throws IOException {
+ System.out.println("Creating a gene distance map from " + annotFname);
+
+ geneDistanceMap = new HashMap>();
+
+ TextFile annotFile = new TextFile(annotFname, false);
+ String els[] = annotFile.readLineElems(TextFile.tab);
+
+ while ((els = annotFile.readLineElems(TextFile.tab)) != null) {
+ int start = Integer.parseInt(els[4]), end = Integer.parseInt(els[5]), middle = start + (end - start) / 2;
+ GenomicBoundary genomicboundary = new GenomicBoundary(els[3], Integer.parseInt(els[4]), Integer.parseInt(els[5]), middle);
+ geneDistanceMap.put(els[1], genomicboundary);
+ }
+ annotFile.close();
+ }
+
+ /**
+ * Checks if the genomic distance between 2 genes is more than 1mb
+ *
+ * @param gene1
+ * @param gene2
+ * @return true if the genes are more than 1mb apart
+ */
+ public boolean genesFarAway(String gene1, String gene2) {
+ // if one of the covariates is a technical bias or a cell count etc
+ if ((!gene1.startsWith("ENS")) || (!gene2.startsWith("ENS"))) {
+ return true;
+ }
+
+ GenomicBoundary gb1 = null, gb2 = null;
+ try {
+ gb1 = geneDistanceMap.get(gene1);
+ gb2 = geneDistanceMap.get(gene2);
+
+ if (gb1.getChromosome() != gb2.getChromosome()) {
+ return true;
+ }
+ if (Math.abs(gb1.getAnnotation() - gb2.getAnnotation()) > 1000000) {
+ return true;
+ }
+ } catch (Exception e) {
+ System.out.println("Error: gene annotation doesn't contain one of these genes: " + gene1 + " or " + gene2);
+ System.exit(1);
+ }
+ return false;
+ }
+
/**
 * PCA-orthogonalizes the dataset stored at {@code inputFile}: transposes and
 * standard-normalizes it, builds the correlation matrix of its rows, runs an
 * eigenvalue decomposition and writes eigenvectors, eigenvalues/explained
 * variance, principal components and factor loadings to files next to the
 * input file.
 *
 * @param inputFile path of the dataset; output file names are derived from it
 */
static public void orthogonalizeDataset(String inputFile) {

    ExpressionDataset dataset = new ExpressionDataset(inputFile);
    dataset.transposeDataset();
    dataset.standardNormalizeData();
    int nrVars = dataset.nrProbes;
    int nrSamples = dataset.nrSamples;

    // Working copy of the normalized data.
    double[][] matrix = new double[nrVars][nrSamples];
    for (int s = 0; s < nrVars; s++) {
        for (int sample = 0; sample < nrSamples; sample++) {
            matrix[s][sample] = dataset.rawData[s][sample];
        }
    }
    // Correlation matrix of the rows; data is standard-normalized, so the
    // scaled cross products are correlations. Only the upper triangle is
    // computed and mirrored.
    double[][] correlationMatrix = new double[nrVars][nrVars];
    for (int p = 0; p < nrVars; p++) {
        correlationMatrix[p][p] = 1d;
        for (int q = p + 1; q < nrVars; q++) {
            double covariance = 0;
            for (int sample = 0; sample < nrSamples; sample++) {
                covariance += matrix[p][sample] * matrix[q][sample];
            }
            covariance /= (double) (nrSamples - 1);
            correlationMatrix[p][q] = covariance;
            correlationMatrix[q][p] = covariance;
        }
    }
    Jama.EigenvalueDecomposition eig = eigenValueDecomposition(correlationMatrix);
    double[] eigenValues = eig.getRealEigenvalues();

    // NOTE(review): eigenVectors is never used below.
    double[][] eigenVectors = new double[correlationMatrix.length][correlationMatrix.length];
    ExpressionDataset datasetEigenvectors = new ExpressionDataset(correlationMatrix.length, correlationMatrix.length);
    ExpressionDataset datasetEigenvalues = new ExpressionDataset(correlationMatrix.length, 2);
    for (int pca = 0; pca < correlationMatrix.length; pca++) {
        datasetEigenvectors.probeNames[pca] = "Comp" + (pca + 1);
        datasetEigenvalues.probeNames[pca] = "Comp" + (pca + 1);
        datasetEigenvectors.sampleNames[pca] = dataset.probeNames[pca];
    }
    datasetEigenvalues.sampleNames[0] = "Eigenvalues";
    datasetEigenvalues.sampleNames[1] = "ExplainedVariance";
    // Components are indexed from the largest eigenvalue (end of the array).
    for (int pca = 0; pca < correlationMatrix.length; pca++) {
        datasetEigenvectors.rawData[pca] = getEigenVector(eig, pca);
        datasetEigenvalues.rawData[pca][0] = eigenValues[eigenValues.length - 1 - pca];
        datasetEigenvalues.rawData[pca][1] = getEigenValueVar(eigenValues, pca);
        System.out.println(pca + "\tExplainedVariance:\t" + getEigenValueVar(eigenValues, pca) + "\tEigenvalue:\t" + eigenValues[eigenValues.length - 1 - pca]);
    }
    datasetEigenvectors.transposeDataset();
    datasetEigenvectors.save(inputFile + ".Eigenvectors.txt");
    datasetEigenvalues.save(inputFile + ".Eigenvalues.txt");

    //Calculate principal components:
    ExpressionDataset datasetPCs = new ExpressionDataset(dataset.nrSamples, correlationMatrix.length);
    for (int pca = 0; pca < correlationMatrix.length; pca++) {
        datasetPCs.sampleNames[pca] = "Comp" + (pca + 1);
    }
    for (int p = 0; p < datasetPCs.nrProbes; p++) {
        datasetPCs.probeNames[p] = dataset.sampleNames[p];
    }
    // Project the data onto each eigenvector (accumulated dot products).
    for (int pca = 0; pca < correlationMatrix.length; pca++) {
        for (int p = 0; p < dataset.nrProbes; p++) {
            for (int s = 0; s < dataset.nrSamples; s++) {
                datasetPCs.rawData[s][pca] += datasetEigenvectors.rawData[p][pca] * dataset.rawData[p][s];
            }
        }
    }
    datasetPCs.save(dataset.fileName + ".PrincipalComponents.txt");

    // Factor loadings = correlation of each PC with each original variable.
    ExpressionDataset datasetFactorloadings = new ExpressionDataset(correlationMatrix.length, correlationMatrix.length);
    datasetPCs.transposeDataset();
    for (int p = 0; p < dataset.nrProbes; p++) {
        datasetFactorloadings.probeNames[p] = dataset.probeNames[p];
    }
    for (int pca = 0; pca < datasetPCs.nrProbes; pca++) {
        datasetFactorloadings.sampleNames[pca] = "Comp" + (pca + 1);
        for (int p = 0; p < dataset.nrProbes; p++) {
            datasetFactorloadings.rawData[p][pca] = JSci.maths.ArrayMath.correlation(datasetPCs.rawData[pca], dataset.rawData[p]);
        }
    }
    datasetFactorloadings.save(dataset.fileName + ".Factorloadings.txt");

}
+
/**
 * In-memory variant of {@link #orthogonalizeDataset}: standard-normalizes the
 * dataset, decomposes the correlation matrix of its rows, and returns the
 * principal components restricted to components with eigenvalue > 1e-10.
 * The input dataset is modified in place by the normalization.
 *
 * @param dataset matrix to orthogonalize (rows = variables)
 * @return the principal-component scores (rows = components after the final
 *         transpose)
 */
static public ExpressionDataset orthogonalizeMatrix(ExpressionDataset dataset) {

    dataset.standardNormalizeData();
    int nrVars = dataset.nrProbes;
    int nrSamples = dataset.nrSamples;
    double[][] matrix = new double[nrVars][nrSamples];
    for (int s = 0; s < nrVars; s++) {
        for (int sample = 0; sample < nrSamples; sample++) {
            matrix[s][sample] = dataset.rawData[s][sample];
        }
    }
    // Correlation matrix, clamped to [-1, 1] against rounding overshoot.
    double[][] correlationMatrix = new double[nrVars][nrVars];
    for (int p = 0; p < nrVars; p++) {
        correlationMatrix[p][p] = 1d;
        for (int q = p + 1; q < nrVars; q++) {
            double covariance = 0;
            for (int sample = 0; sample < nrSamples; sample++) {
                covariance += matrix[p][sample] * matrix[q][sample];
            }
            covariance /= (double) (nrSamples - 1);
            if (covariance > 1) {
                covariance = 1d;
            }
            if (covariance < -1) {
                covariance = -1d;
            }
            correlationMatrix[p][q] = covariance;
            correlationMatrix[q][p] = covariance;
        }
    }
    Jama.EigenvalueDecomposition eig = eigenValueDecomposition(correlationMatrix);
    double[] eigenValues = eig.getRealEigenvalues();
    // Keep only numerically positive components.
    int nrCompsWithPositiveEigenvalues = 0;
    for (int e = 0; e < eigenValues.length; e++) {
        //System.out.println(e + "\t" + eigenValues[e]);
        if (eigenValues[e] > 1e-10) {
            nrCompsWithPositiveEigenvalues++;
        }
    }

    ExpressionDataset datasetEigenvectors = new ExpressionDataset(correlationMatrix.length, correlationMatrix.length);
    for (int pca = 0; pca < correlationMatrix.length; pca++) {
        datasetEigenvectors.rawData[pca] = getEigenVector(eig, pca);
    }
    datasetEigenvectors.transposeDataset();

    //Calculate principal components:
    ExpressionDataset datasetPCs = new ExpressionDataset(dataset.nrSamples, nrCompsWithPositiveEigenvalues);
    for (int pca = 0; pca < nrCompsWithPositiveEigenvalues; pca++) {
        datasetPCs.sampleNames[pca] = "Comp" + (pca + 1);
    }
    for (int p = 0; p < datasetPCs.nrProbes; p++) {
        datasetPCs.probeNames[p] = dataset.sampleNames[p];
    }
    // Project the data onto each retained eigenvector.
    for (int pca = 0; pca < nrCompsWithPositiveEigenvalues; pca++) {
        for (int p = 0; p < dataset.nrProbes; p++) {
            for (int s = 0; s < dataset.nrSamples; s++) {
                datasetPCs.rawData[s][pca] += datasetEigenvectors.rawData[p][pca] * dataset.rawData[p][s];
            }
        }
    }
    datasetPCs.transposeDataset();
    return datasetPCs;

}
+
+ static public double[] getLinearRegressionCoefficients(double[] xVal, double[] yVal) {
+ double n = (double) xVal.length;
+ double sumX = 0;
+ double sumXX = 0;
+ double sumY = 0;
+ double sumXY = 0;
+ for (int x = 0; x < xVal.length; x++) {
+ sumX += xVal[x];
+ sumXX += xVal[x] * xVal[x];
+ sumY += yVal[x];
+ sumXY += xVal[x] * yVal[x];
+ }
+ double sXX = sumXX - sumX * sumX / n;
+ double sXY = sumXY - sumX * sumY / n;
+ double a = sXY / sXX;
+ double b = (sumY - a * sumX) / n;
+ double[] regressionCoefficients = new double[2];
+ regressionCoefficients[0] = a;
+ regressionCoefficients[1] = b;
+ return regressionCoefficients;
+ }
+
+ static public Jama.EigenvalueDecomposition eigenValueDecomposition(double[][] data) {
+ Jama.Matrix m = new Jama.Matrix(data);
+ Jama.EigenvalueDecomposition eig = m.eig();
+ return eig;
+ }
+
+ static public double[] getEigenVector(Jama.EigenvalueDecomposition eig, double[] eigenValues, int pca) {
+ Jama.Matrix eigenValueMatrix = eig.getV();
+ double[][] eigenValueMat = eigenValueMatrix.getArray();
+ double[] eigenVector = new double[eigenValueMat.length];
+ for (int i = 0; i < eigenValueMat.length; i++) {
+ eigenVector[i] = eigenValueMat[i][eigenValueMat.length - 1 - pca]; // * Math.sqrt(eigenValues[eigenValues.length - 1 - pca]);
+ }
+ return eigenVector;
+ }
+
+ static public double[] getEigenVector(Jama.EigenvalueDecomposition eig, int pca) {
+ Jama.Matrix eigenValueMatrix = eig.getV();
+ double[][] eigenValueMat = eigenValueMatrix.getArray();
+ double[] eigenVector = new double[eigenValueMat.length];
+ for (int i = 0; i < eigenValueMat.length; i++) {
+ eigenVector[i] = eigenValueMat[i][eigenValueMat.length - 1 - pca]; // * Math.sqrt(eigenValues[eigenValues.length - 1 - pca]);
+ }
+ return eigenVector;
+ }
+
+ static public double getEigenValueVar(double[] eigenValues, int pca) {
+ double sumEigenvalues = 0.0;
+ for (Double d : eigenValues) {
+ sumEigenvalues += Math.abs(d);
+ }
+ double result = eigenValues[eigenValues.length - 1 - pca] / sumEigenvalues;
+ return result;
+ }
+
+ static public double[] getEigenVectorSVD(Jama.SingularValueDecomposition svd, double[] singularValues, int pca) {
+ Jama.Matrix eigenValueMatrix = svd.getV();
+ double[][] eigenValueMat = eigenValueMatrix.getArray();
+ double[] eigenVector = new double[eigenValueMat.length];
+ for (int i = 0; i < eigenValueMat.length; i++) {
+ eigenVector[i] = eigenValueMat[i][pca] * Math.sqrt(singularValues[pca]);
+ }
+ return eigenVector;
+ }
+
/**
 * Removes cis-eQTL genotype effects from each covariate: for every covariate
 * that has known QTL SNPs, jointly regresses the covariate on all of those
 * SNPs' dosages (OLS) and replaces the covariate with the residuals.
 * Modifies {@code datasetCovariates} in place.
 *
 * @param datasetCovariates    covariate matrix (rows = covariates)
 * @param datasetGenotypes     dosage matrix; probe names are "snpId_..." and
 *                             the part before the first '_' is used as the SNP id
 * @param qtlProbeSnpMultiMap  covariate/probe name -> set of its QTL SNP ids
 * @throws Exception if a genotype probe name contains no '_' separator
 */
private void correctCovariatesForQtls(ExpressionDataset datasetCovariates, ExpressionDataset datasetGenotypes, HashMultimap qtlProbeSnpMultiMap) throws Exception {

    System.out.println("Correcting covariate data for cis-eQTL effects:");

    // One regression object is reused; newSampleData resets it per covariate.
    OLSMultipleLinearRegression ols = new OLSMultipleLinearRegression();

    // Map bare SNP id -> row index in the genotype matrix.
    HashMap snpMap = new HashMap(datasetGenotypes.nrProbes);
    for (Map.Entry snpEntry : datasetGenotypes.hashProbes.entrySet()) {
        try {
            snpMap.put(snpEntry.getKey().substring(0, snpEntry.getKey().indexOf('_')), snpEntry.getValue());
        } catch (Exception e) {
            // Probe name without '_': log it and rethrow.
            System.out.println(snpEntry.getKey());
            throw e;
        }
    }

    for (int p = 0; p < datasetCovariates.nrProbes; p++) {

        String probe = datasetCovariates.probeNames[p];
        Set probeQtls = qtlProbeSnpMultiMap.get(probe);

//			System.out.println("");
//			System.out.println("-------------------------------------");
//			System.out.println("");
//			System.out.println(probe + " with " + probeQtls.size() + " SNPs");
//			System.out.println("");

        if (!probeQtls.isEmpty()) {

            // First count how many of the QTL SNPs are present in the data,
            // so the design matrix can be sized exactly.
            int snpsInData = 0;
            for (String snp : probeQtls) {

                Integer s = snpMap.get(snp);
                if (s != null) {
                    ++snpsInData;
                }

            }

            // NOTE(review): if none of the QTL SNPs are in the data this
            // builds a zero-column design matrix; presumably OLS then throws
            // -- confirm expected behaviour for that edge case.
            double[][] x = new double[datasetCovariates.nrSamples][snpsInData];

            int k = 0;
            for (String snp : probeQtls) {

                Integer s = snpMap.get(snp);
                if (s == null) {
                    continue;
                    //throw new Exception("Snp " + snp + " not found");
                }
                double[] snpData = datasetGenotypes.rawData[s];
                for (int i = 0; i < datasetGenotypes.nrSamples; ++i) {
                    x[i][k] = snpData[i];
                }

                k++;
            }

//				PearsonsCorrelation cor = new PearsonsCorrelation();

//				System.out.println("Before");
//				for(String snp : probeQtls){
//					Integer s = snpMap.get(snp);
//					System.out.println(snp + " - " + cor.correlation(datasetCovariates.rawData[p], datasetGenotypes.rawData[s]));
//				}

            // Replace the covariate by its residuals after removing all
            // QTL SNP effects jointly.
            ols.newSampleData(datasetCovariates.rawData[p], x);
            datasetCovariates.rawData[p] = ols.estimateResiduals();

//				System.out.println("After");
//				for(String snp : probeQtls){
//					Integer s = snpMap.get(snp);
//					System.out.println(snp + " - " + cor.correlation(datasetCovariates.rawData[p], datasetGenotypes.rawData[s]));
//				}

        }

    }

//		for (int p = 0; p < datasetCovariates.nrProbes; p++) {
//			if (datasetExpression.hashProbes.containsKey(datasetCovariates.probeNames[p])) {
//				int index = ((Integer) datasetExpression.hashProbes.get(datasetCovariates.probeNames[p])).intValue();
//				double[] rc = getLinearRegressionCoefficients(datasetGenotypes.rawData[index], datasetCovariates.rawData[p]);
//				for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
//					datasetCovariates.rawData[p][s] -= rc[0] * datasetGenotypes.rawData[index][s];
//				}
//			}
//		}

}
+
+ private HashMap excludeOutliers(HashMap hashSamples) {
+ System.out.println("Removing outlier samples!!!");
+ HashMap hashCovariates = new HashMap();
+ hashCovariates.put("MEDIAN_5PRIME_BIAS", null);
+ hashCovariates.put("MEDIAN_3PRIME_BIAS", null);
+ ExpressionDataset datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', hashCovariates, null);
+ hashSamples = new HashMap();
+ for (int s = 0; s < datasetCovariates.nrSamples; s++) {
+ if (datasetCovariates.rawData[0][s] != 0) {
+ hashSamples.put(datasetCovariates.sampleNames[s], null);
+ }
+ }
+ datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', hashCovariates, hashSamples);
+ HashMap hashSamplesToExclude = new HashMap();
+ if (1 == 1) {
+ int index = ((Integer) datasetCovariates.hashProbes.get("MEDIAN_5PRIME_BIAS")).intValue();
+ double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[index]);
+ double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[index]);
+ for (int s = 0; s < datasetCovariates.nrSamples; s++) {
+ double z = (datasetCovariates.rawData[index][s] - mean) / stdev;
+ if (Math.abs(z) > 3) {
+ hashSamplesToExclude.put(datasetCovariates.sampleNames[s], null);
+ }
+ }
+ }
+ if (1 == 1) {
+ int index = ((Integer) datasetCovariates.hashProbes.get("MEDIAN_3PRIME_BIAS")).intValue();
+ double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[index]);
+ double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[index]);
+ for (int s = 0; s < datasetCovariates.nrSamples; s++) {
+ double z = (datasetCovariates.rawData[index][s] - mean) / stdev;
+ if (Math.abs(z) > 3) {
+ hashSamplesToExclude.put(datasetCovariates.sampleNames[s], null);
+ }
+ }
+ }
+ hashSamples = new HashMap();
+ for (int s = 0; s < datasetCovariates.nrSamples; s++) {
+ if (!hashSamplesToExclude.containsKey(datasetCovariates.sampleNames[s])) {
+ hashSamples.put(datasetCovariates.sampleNames[s], null);
+ hashSamples.put(datasetCovariates.sampleNames[s] + "_exp", null);
+ hashSamples.put(datasetCovariates.sampleNames[s] + "_dosage", null);
+ }
+ }
+ return hashSamples;
+ }
+
/**
 * Removes cohort-specific (and listed covariate) effects from the covariate
 * matrix by single-covariate linear regression, in place. Covariates whose
 * residual standard deviation collapses below 1E-5 are flattened to their mean.
 *
 * @param covsToCorrect2    covariate names copied into the correction set
 * @param covsToCorrect     second list of covariate names (sizing only in the
 *                          visible code -- see the garbled-line note below)
 * @param datasetGenotypes  supplies sample count and sample names
 * @param datasetCovariates covariate matrix, modified in place
 * @throws Exception if a covariate in {@code covsToCorrect2} is not found
 */
private void correctCovariateData(String[] covsToCorrect2, String[] covsToCorrect, ExpressionDataset datasetGenotypes, ExpressionDataset datasetCovariates) throws Exception {

    System.out.println("Correcting covariate data for cohort specific effects:");
//        String[] cohorts = {"LLDeep","LLS","RS","CODAM"};
    ExpressionDataset datasetCovariatesToCorrectFor = new ExpressionDataset(covsToCorrect2.length + covsToCorrect.length, datasetGenotypes.nrSamples);
    datasetCovariatesToCorrectFor.sampleNames = datasetGenotypes.sampleNames;

    HashMap hashCovsToCorrect = new HashMap();

    // Copy the covsToCorrect2 covariates into the correction matrix.
    for (int i = 0; i < covsToCorrect2.length; ++i) {
        String cov = covsToCorrect2[i];
        hashCovsToCorrect.put(cov, null);
        Integer c = datasetCovariates.hashProbes.get(cov);
        if (c == null) {
            throw new Exception("Covariate not found: " + cov);
        }
        for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
            datasetCovariatesToCorrectFor.rawData[i][s] = datasetCovariates.rawData[c][s];
        }
    }

    // NOTE(review): the next line is garbled in this copy (angle-bracket
    // stripping appears to have swallowed text, likely including the probe
    // loop, a containsKey guard and the correction-covariate loop that define
    // 'p' and 'cov' used below) -- restore this region from version control.
// for (int p=0; p 1E-5) {
    double[] rc = getLinearRegressionCoefficients(datasetCovariatesToCorrectFor.rawData[cov], datasetCovariates.rawData[p]);
    for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
        datasetCovariates.rawData[p][s] -= rc[0] * datasetCovariatesToCorrectFor.rawData[cov][s];
    }
    }
    }
    // Flatten near-constant residual covariates to their mean.
    double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[p]);
    double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[p]);
    if (stdev < 1E-5) {
        for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
            datasetCovariates.rawData[p][s] = mean;
        }
    }
    }
    }
}
+
/**
 * Prepares the force-normalized covariate matrix for eQTL-on-covariate
 * testing: residualizes every covariate (except those being corrected for)
 * against an orthogonalized set built from both covariate lists plus 25
 * principal components, then rank-transforms each covariate to a standard
 * normal. The input dataset is modified in place and also returned.
 *
 * @param covsToCorrect2 covariate names to include in the correction set
 * @param covsToCorrect  second list of covariate names to include
 * @param datasetGenotypes supplies sample count/names
 * @param datasetCovariatesPCAForceNormal covariate matrix, modified in place
 * @return the same (now corrected and normalized) covariate dataset
 * @throws Exception if a covariate in {@code covsToCorrect2} is not found
 */
private ExpressionDataset correctCovariateDataPCA(String[] covsToCorrect2, String[] covsToCorrect, ExpressionDataset datasetGenotypes, ExpressionDataset datasetCovariatesPCAForceNormal) throws Exception {

    int nrCompsToCorrectFor = 25;

    System.out.println("Preparing data for testing eQTL effects of SNPs on covariate data:");
    System.out.println("Correcting covariate data for cohort specific effects:");

    ExpressionDataset datasetCovariatesToCorrectFor = new ExpressionDataset(covsToCorrect2.length + covsToCorrect.length + nrCompsToCorrectFor, datasetGenotypes.nrSamples);
    datasetCovariatesToCorrectFor.sampleNames = datasetGenotypes.sampleNames;

    // add covariates from the first list
    HashMap hashCovsToCorrect = new HashMap();

    // add covariates from the second list
    for (int i = 0; i < covsToCorrect2.length; ++i) {
        String cov = covsToCorrect2[i];
        hashCovsToCorrect.put(cov, null);
        Integer c = datasetCovariatesPCAForceNormal.hashProbes.get(cov);
        if (c == null) {
            throw new Exception("Covariate not found: " + cov);
        }
        for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
            datasetCovariatesToCorrectFor.rawData[i][s] = datasetCovariatesPCAForceNormal.rawData[c][s];
        }
    }

    // Append the covsToCorrect covariates after the covsToCorrect2 block.
    int[] covsToCorrectIndex = new int[covsToCorrect.length];
    for (int c = 0; c < covsToCorrect.length; c++) {
        hashCovsToCorrect.put(covsToCorrect[c], null);
        covsToCorrectIndex[c] = ((Integer) datasetCovariatesPCAForceNormal.hashProbes.get(covsToCorrect[c])).intValue();
        for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
            datasetCovariatesToCorrectFor.rawData[covsToCorrect2.length + c][s] = datasetCovariatesPCAForceNormal.rawData[covsToCorrectIndex[c]][s];
        }
    }

    // add PCs
    // NOTE(review): assumes the last 51 rows of the covariate table hold
    // principal components and takes the first 25 of them -- TODO confirm
    // against how the covariate table is built.
    if (nrCompsToCorrectFor > 0) {
        for (int comp = 0; comp < nrCompsToCorrectFor; comp++) {
            for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
                datasetCovariatesToCorrectFor.rawData[covsToCorrect2.length + covsToCorrect.length + comp][s] = datasetCovariatesPCAForceNormal.rawData[datasetCovariatesPCAForceNormal.nrProbes - 51 + comp][s];
            }
        }
    }

    datasetCovariatesToCorrectFor.transposeDataset();

    // Orthogonalize the correction set (round-trips through files in inputDir).
    datasetCovariatesToCorrectFor.save(inputDir + "/CovariatesToCorrectFor.txt");
    orthogonalizeDataset(inputDir + "/CovariatesToCorrectFor.txt");
    datasetCovariatesToCorrectFor = new ExpressionDataset(inputDir + "/CovariatesToCorrectFor.txt.PrincipalComponents.txt");
    datasetCovariatesToCorrectFor.transposeDataset();
    ExpressionDataset datasetCovariatesToCorrectForEigenvalues = new ExpressionDataset(inputDir + "/CovariatesToCorrectFor.txt.Eigenvalues.txt");

    // Residualize each covariate (skipping the correction covariates
    // themselves) against every component with a non-degenerate eigenvalue.
    for (int p = 0; p < datasetCovariatesPCAForceNormal.nrProbes; p++) {
        if (!hashCovsToCorrect.containsKey(datasetCovariatesPCAForceNormal.probeNames[p])) {
            for (int cov = 0; cov < datasetCovariatesToCorrectFor.nrProbes; cov++) {
                if (datasetCovariatesToCorrectForEigenvalues.rawData[cov][0] > 1E-5) {
                    double[] rc = getLinearRegressionCoefficients(datasetCovariatesToCorrectFor.rawData[cov], datasetCovariatesPCAForceNormal.rawData[p]);
                    for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
                        datasetCovariatesPCAForceNormal.rawData[p][s] -= rc[0] * datasetCovariatesToCorrectFor.rawData[cov][s];
                    }
                }
            }
            /*double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[p]);
             double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[p]);
             if (stdev < 1E-5) {
             for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
             datasetCovariatesPCAForceNormal.rawData[p][s] = mean;
             }
             }*/
        }
    }

    System.out.println("Enforcing normal distribution on covariates");

    NaturalRanking ranker = new NaturalRanking();

    // Rank-based inverse-normal transform of every covariate.
    for (int p = 0; p < datasetCovariatesPCAForceNormal.nrProbes; p++) {
        //Rank order the expression values:
        double[] values = new double[datasetCovariatesPCAForceNormal.nrSamples];
        for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
            values[s] = datasetCovariatesPCAForceNormal.rawData[p][s];
        }
        double[] rankedValues = ranker.rank(values);
        //Replace the original expression value with the standard distribution enforce:
        for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
            //Convert the rank to a proportion, with range <0, 1>
            double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length);
            //Convert the pValue to a Z-Score:
            double zScore = cern.jet.stat.tdouble.Probability.normalInverse(pValue);
            datasetCovariatesPCAForceNormal.rawData[p][s] = zScore; //Replace original expression value with the Z-Score
        }
    }
    return datasetCovariatesPCAForceNormal;
}
+
/**
 * Removes cohort-specific effects and the top 25 principal components from
 * the expression data, in place: builds a correction matrix from the listed
 * covariates plus 25 PCs, orthogonalizes it (via files in outputDir), and
 * residualizes every expression row against each non-degenerate component.
 *
 * @param covsToCorrect2    covariate names to include in the correction set
 * @param datasetGenotypes  supplies sample count and sample names
 * @param datasetCovariates source of covariate values and PCs
 * @param datasetExpression expression matrix, modified in place
 * @throws Exception if a covariate in {@code covsToCorrect2} is not found
 */
private void correctExpressionData(String[] covsToCorrect2, ExpressionDataset datasetGenotypes, ExpressionDataset datasetCovariates, ExpressionDataset datasetExpression) throws Exception {
    //Define a set of covariates that we want to use as correction:
    System.out.println("Correcting gene expression data for cohort specific effects and top 25 components");
    //String[] cohorts = {"LLDeep", "LLS", "RS", "CODAM"};
    int nrCompsToCorrectFor = 25;
    ExpressionDataset datasetCovariatesToCorrectFor = new ExpressionDataset(covsToCorrect2.length + nrCompsToCorrectFor, datasetGenotypes.nrSamples);
    datasetCovariatesToCorrectFor.sampleNames = datasetGenotypes.sampleNames;

    for (int i = 0; i < covsToCorrect2.length; ++i) {
        String cov = covsToCorrect2[i];
        Integer c = datasetCovariates.hashProbes.get(cov);
        if (c == null) {
            throw new Exception("Covariate not found: " + cov);
        }
        for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
            datasetCovariatesToCorrectFor.rawData[i][s] = datasetCovariates.rawData[c][s];
        }
    }

//        for (int p = 0; p < cohorts.length; p++) {
//            for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
//                if (datasetGenotypes.sampleNames[s].startsWith(cohorts[p])) {
//                    datasetCovariatesToCorrectFor.rawData[p][s] = 1;
//                }
//            }
//        }
    // NOTE(review): assumes the last 51 rows of the covariate table hold
    // principal components and takes the first 25 -- TODO confirm.
    if (nrCompsToCorrectFor > 0) {
        for (int comp = 0; comp < nrCompsToCorrectFor; comp++) {
            for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
                datasetCovariatesToCorrectFor.rawData[covsToCorrect2.length + comp][s] = datasetCovariates.rawData[datasetCovariates.nrProbes - 51 + comp][s];
            }
        }
    }

    datasetCovariatesToCorrectFor.transposeDataset();

    // Orthogonalize the correction set (round-trips through files in outputDir).
    datasetCovariatesToCorrectFor.save(outputDir + "/CovariatesToCorrectFor.txt");
    orthogonalizeDataset(outputDir + "/CovariatesToCorrectFor.txt");
    datasetCovariatesToCorrectFor = new ExpressionDataset(outputDir + "/CovariatesToCorrectFor.txt.PrincipalComponents.txt");
    datasetCovariatesToCorrectFor.transposeDataset();
    ExpressionDataset datasetCovariatesToCorrectForEigenvalues = new ExpressionDataset(outputDir + "/CovariatesToCorrectFor.txt.Eigenvalues.txt");
    // Residualize every expression row ('snp' here indexes expression probes,
    // which are paired one-to-one with SNPs elsewhere in this class --
    // presumably by construction of the eQTL input; confirm).
    for (int snp = 0; snp < datasetExpression.nrProbes; snp++) {
        for (int cov = 0; cov < datasetCovariatesToCorrectFor.nrProbes; cov++) {
            if (datasetCovariatesToCorrectForEigenvalues.rawData[cov][0] > 1E-5) {
                double[] rc = getLinearRegressionCoefficients(datasetCovariatesToCorrectFor.rawData[cov], datasetExpression.rawData[snp]);
                for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
                    datasetExpression.rawData[snp][s] -= rc[0] * datasetCovariatesToCorrectFor.rawData[cov][s];
                }
            }
        }
    }
}
+
+ private void correctDosageDirectionForQtl(File snpsToSwapFile, ExpressionDataset datasetGenotypes, ExpressionDataset datasetExpression) throws IOException {
+ //double[] mainEQTLCorr = new double[datasetGenotypes.nrProbes];
+
+
+ if (snpsToSwapFile != null) {
+ System.out.println("Enforcing for every eQTL that the genotype dosage is swapped based on: " + snpsToSwapFile.getAbsolutePath());
+
+ HashSet snpsToSwap = new HashSet();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(snpsToSwapFile), "UTF-8"));
+ String line;
+ while ((line = reader.readLine()) != null) {
+ snpsToSwap.add(line);
+ }
+ reader.close();
+
+ for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) {
+
+ if (snpsToSwap.contains(datasetGenotypes.probeNames[snp])) {
+
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ datasetGenotypes.rawData[snp][s] = 2 - datasetGenotypes.rawData[snp][s];
+ }
+
+ }
+
+ //mainEQTLCorr[snp] = corr;
+ }
+
+
+ } else {
+ System.out.println("Enforcing for every eQTL that the genotype dosage positively correlated with gene expression levels:");
+
+ Writer writer = new BufferedWriter(new FileWriter(outputDir + "/swappedDosages.txt"));
+ for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) {
+ double corr = JSci.maths.ArrayMath.correlation(datasetGenotypes.rawData[snp], datasetExpression.rawData[snp]);
+ //System.out.println(datasetExpression.probeNames[snp] + "\t" + snp + "\t" + corr);
+
+ if (corr < 0) {
+ corr = -corr;
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ datasetGenotypes.rawData[snp][s] = 2 - datasetGenotypes.rawData[snp][s];
+ }
+ writer.append(datasetGenotypes.probeNames[snp]);
+ writer.append('\n');
+ }
+
+ //mainEQTLCorr[snp] = corr;
+ }
+ writer.close();
+
+ }
+ }
+
+ private void saveCorrectedCovariates(ExpressionDataset datasetCovariates) {
+ datasetCovariates.save(inputDir + "/CovariatesCorrected.txt");
+ HashMap hashProbesToFilter = new HashMap();
+ for (int p = 0; p < datasetCovariates.nrProbes; p++) {
+ if (datasetCovariates.probeNames[p].startsWith("ENSG")) {
+ hashProbesToFilter.put(datasetCovariates.probeNames[p], null);
+ }
+ }
+ ExpressionDataset datasetCovariatesCorrected = new ExpressionDataset(inputDir + "/CovariatesCorrected.txt", '\t', hashProbesToFilter, null);
+ datasetCovariatesCorrected.transposeDataset();
+ datasetCovariatesCorrected.save(inputDir + "/CovariatesCorrected.txt");
+ System.exit(0);
+ }
+
+ private void icaCovariates(ExpressionDataset datasetCovariates) {
+ ExpressionDataset datasetICA = new ExpressionDataset("/Users/lude/Documents/ICA/mixingmatrix.txt");
+ //ExpressionDataset datasetICA = new ExpressionDataset("/Users/lude/Documents/ICA/signals.txt");
+ datasetICA.transposeDataset();
+ for (int p = 0; p < datasetICA.nrProbes; p++) {
+ datasetCovariates.rawData[p] = datasetICA.rawData[p];
+ datasetCovariates.probeNames[p] = datasetICA.probeNames[p];
+ if (p == 7) {
+ for (int q = 0; q < datasetCovariates.nrProbes; q++) {
+ double corr = JSci.maths.ArrayMath.correlation(datasetICA.rawData[p], datasetCovariates.rawData[q]);
+ System.out.println(p + "\t" + datasetICA.probeNames[p] + "\t" + q + "\t" + datasetCovariates.probeNames[q] + "\t" + corr + "\t" + corr * corr);
+ }
+ }
+ }
+
+ orthogonalizeDataset("/Users/lude/Documents/ICA/mixingmatrix.txt");
+ //System.exit(0);
+ }
+
+ private void forceNormalCovariates(ExpressionDataset datasetCovariates, ExpressionDataset datasetGenotypes) throws ArithmeticException {
+ System.out.println("Enforcing normal distribution on covariates");
+
+ NaturalRanking ranker = new NaturalRanking();
+
+ for (int p = 0; p < datasetCovariates.nrProbes; p++) {
+ //Rank order the expression values:
+ double[] values = new double[datasetCovariates.nrSamples];
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ values[s] = datasetCovariates.rawData[p][s];
+ }
+ double[] rankedValues = ranker.rank(values);
+ //Replace the original expression value with the standard distribution enforce:
+ for (int s = 0; s < datasetGenotypes.nrSamples; s++) {
+ //Convert the rank to a proportion, with range <0, 1>
+ double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length);
+ //Convert the pValue to a Z-Score:
+ double zScore = cern.jet.stat.tdouble.Probability.normalInverse(pValue);
+ datasetCovariates.rawData[p][s] = zScore; //Replace original expression value with the Z-Score
+ }
+ }
+ }
+
    /**
     * Regresses predefined gene-environment interaction effects out of the
     * expression data. For each expression probe, a linear model is built with
     * predictors: the genotypes of the probe's known eQTL SNPs (those present
     * in the genotype data with non-zero variance), each covariate in
     * {@code covsToCorrect}, and a genotype-x-covariate interaction term per
     * covariate. The probe's expression row is replaced by the residuals.
     *
     * @param covsToCorrect        covariate names to correct for (must exist in datasetCovariates)
     * @param datasetCovariates    covariate matrix (covariates x samples)
     * @param datasetGenotypes     genotype matrix; probe names are "snpId_..." (split on '_')
     * @param nrSamples            number of samples (columns) in all datasets
     * @param datasetExpression    expression matrix, residualized in place
     * @param regression           reusable OLS regression object
     * @param qtlProbeSnpMultiMap  map from expression probe to its eQTL SNP ids
     * @throws Exception if a probe has no known eQTLs, or a genotype probe name lacks '_'
     */
    private void correctExpressionDataForInteractions(String[] covsToCorrect, ExpressionDataset datasetCovariates, ExpressionDataset datasetGenotypes, int nrSamples, ExpressionDataset datasetExpression, OLSMultipleLinearRegression regression, HashMultimap qtlProbeSnpMultiMap) throws MathIllegalArgumentException, Exception {

        System.out.println("Correcting expression data for predefined gene environment interaction effects: " + Arrays.toString(covsToCorrect));
        // Resolve each covariate name to its row index in the covariate dataset.
        int[] covsToCorrectIndex = new int[covsToCorrect.length];
        for (int c = 0; c < covsToCorrect.length; c++) {
            covsToCorrectIndex[c] = ((Integer) datasetCovariates.hashProbes.get(covsToCorrect[c])).intValue();

        }

        // Map bare SNP id -> genotype row index. Genotype probe names are of the
        // form "snpId_suffix"; everything before the first '_' is the SNP id.
        HashMap snpMap = new HashMap(datasetGenotypes.nrProbes);
        for (Map.Entry snpEntry : datasetGenotypes.hashProbes.entrySet()) {
            try {
                snpMap.put(snpEntry.getKey().substring(0, snpEntry.getKey().indexOf('_')), snpEntry.getValue());
            } catch (Exception e) {
                // Probe name without '_' — report which one before rethrowing.
                System.out.println(snpEntry.getKey());
                throw e;
            }
        }

        Variance v = new Variance();

        for (int p = 0; p < datasetExpression.nrProbes; p++) {

            // Expression probe names end in "_suffix"; strip it to get the gene id.
            String probe = datasetExpression.probeNames[p].substring(0, datasetExpression.probeNames[p].lastIndexOf('_'));
            Set probeQtls = qtlProbeSnpMultiMap.get(probe);

            if (probeQtls.isEmpty()) {
                throw new Exception("No eQTLs found for: " + probe);
            }

            // First pass: count usable eQTL SNPs (present in the data and with
            // non-zero variance); monomorphic SNPs are excluded.
            int snpsInData = 0;
            HashSet excludedSnps = new HashSet();
            for (String snp : probeQtls) {

                Integer s = snpMap.get(snp);
                if (s != null) {



                    if (v.evaluate(datasetGenotypes.rawData[s]) > 0) {
                        ++snpsInData;
                    } else {
                        excludedSnps.add(snp);
                    }

                }



            }

            //boolean foundPisS = false;
            // Design matrix: one column per usable eQTL SNP, then for each
            // covariate a covariate column and an interaction column.
            double[][] valsX = new double[nrSamples][snpsInData + covsToCorrect.length * 2]; //store genotypes, covariates, interactions
            int k = 0;
            // Second pass: fill the genotype columns.
            for (String snp : probeQtls) {

                if (excludedSnps.contains(snp)) {
                    continue;
                }

                Integer s = snpMap.get(snp);
                if (s == null) {
                    //throw new Exception("Snp " + snp + " not found");
                    continue;
                }
//                if(s.intValue() == p){
//                    foundPisS = true;
//                }
                double[] snpData = datasetGenotypes.rawData[s];
                for (int i = 0; i < datasetGenotypes.nrSamples; ++i) {
                    valsX[i][k] = snpData[i];
                }

                k++;
            }
//            if(!foundPisS){
//
//                System.out.println("Expected snp: " + datasetGenotypes.probeNames[p] + " at index: " + p);
//
//                for(String qtlSnp : probeQtls = qtlProbeSnpMultiMap.get(probe)){
//                    System.out.println("QTL snp: " + qtlSnp + " found at index: " + snpMap.get(qtlSnp));
//                }
//
//                throw new Exception("Error 2");
//            }
            // Fill covariate and interaction columns.
            // NOTE(review): the interaction term uses datasetGenotypes.rawData[p],
            // i.e. the genotype row at the EXPRESSION probe index p — this assumes
            // the genotype and expression datasets are row-aligned (see the
            // commented-out foundPisS check above). TODO confirm that alignment.
            for (int c = 0; c < covsToCorrect.length; c++) {
                double[] covData = datasetCovariates.rawData[covsToCorrectIndex[c]];
                double[] snpData = datasetGenotypes.rawData[p];

                for (int s = 0; s < nrSamples; s++) {
                    valsX[s][c * 2 + snpsInData] = covData[s]; //covariate
                    valsX[s][c * 2 + snpsInData + 1] = snpData[s] * covData[s]; //interction
                }
            }
            // Fit OLS and replace the expression row by the residuals; on failure
            // the row is deliberately left uncorrected (best-effort) and logged.
            double[] valsY = datasetExpression.rawData[p];
            regression.newSampleData(valsY, valsX);
            try {
                datasetExpression.rawData[p] = regression.estimateResiduals();
            } catch (Exception up) {
                System.err.println("Error correcting for interactions: " + probe + " - " + datasetGenotypes.probeNames[p]);
            }
        }
    }
+
+ private void forceNormalExpressionData(ExpressionDataset datasetExpression) throws ArithmeticException {
+ System.out.println("Enforcing normal distribution on expression data:");
+
+ NaturalRanking ranker = new NaturalRanking();
+
+ for (int p = 0; p < datasetExpression.nrProbes; p++) {
+ //Rank order the expression values:
+ double[] values = new double[datasetExpression.nrSamples];
+ for (int s = 0; s < datasetExpression.nrSamples; s++) {
+ values[s] = datasetExpression.rawData[p][s];
+ }
+
+ double[] rankedValues = ranker.rank(values);
+ //Replace the original expression value with the standard distribution enforce:
+ for (int s = 0; s < datasetExpression.nrSamples; s++) {
+ //Convert the rank to a proportion, with range <0, 1>
+ double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length);
+ //Convert the pValue to a Z-Score:
+ double zScore = cern.jet.stat.tdouble.Probability.normalInverse(pValue);
+ datasetExpression.rawData[p][s] = zScore; //Replace original expression value with the Z-Score
+ }
+ }
+
+ System.out.println("Expression data now force normal");
+ }
+
+ private HashMap readEnsgAnnotations(File ensgAnnotationFile) throws FileNotFoundException, IOException {
+ final HashMap ensgAnnotations = new HashMap();
+ CSVReader refReader = new CSVReader(new FileReader(ensgAnnotationFile), '\t', '\0', '\0');
+ refReader.readNext();
+ String[] nextLine;
+ while ((nextLine = refReader.readNext()) != null) {
+ ensgAnnotations.put(nextLine[0], new GeneAnnotation(nextLine[0], nextLine[1], nextLine[2], Integer.valueOf(nextLine[3]), Integer.valueOf(nextLine[4])));
+ }
+ return ensgAnnotations;
+ }
+}
\ No newline at end of file
diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/VectorSorter.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/VectorSorter.java
new file mode 100644
index 000000000..3915cdb0c
--- /dev/null
+++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/VectorSorter.java
@@ -0,0 +1,129 @@
+/*
+ * VectorSorter.java
+ *
+ * Created on 23 December 2003, 17:02
+ */
+
+package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser;
+
+ import java.util.*;
+ import java.io.*;
+
+
+/** An implementation of a quick sort algorithm.
+ *
+ * Shamelessly taken from Martin Senger's (senger@ebi.ac.uk) Java collection
+ *
+ * It sorts a vector of strings, but can be easily extended to sort
+ * a vector of other objects (by overwritting just one method
+ * lt(Object,Object)).
+ *
+ * It implements a generic version of C.A.R Hoare's Quick Sort
+ * algorithm.
+ *
+ * The code is based on example given in java swing package.
+ *
+ *
+ * This is an example how to use this class to sort a vector of strings:
+ *
+ * Vector v = new Vector();
+ * v.addElement ("X");
+ * v.addElement ("A");
+ * new Sorter().sort (v);
+ *
+ *
+ * @author Martin Senger
+ * @version $Id: VectorSorter.java,v 1.1.1.1 2004/01/26 09:27:02 lude Exp $
+ * @see TestSorter
+ */
+public class VectorSorter {
+
    /****************************************************************************
     * A default constructor. It does nothing; the class keeps no state and
     * exists only to host the sort methods.
     ****************************************************************************/
    public VectorSorter() {
    }
+
    /****************************************************************************
     * Sorts the given vector in place using quick sort.
     * By default it is assumed that the vector contains elements of type String.
     * If not, a subclass must be written which overrides the comparison method
     * lt(Object,Object).
     *
     * NOTE(review): an empty vector calls quickSort(v, 0, -1); presumably the
     * recursion's base case handles hi &lt; lo — confirm in quickSort.
     *
     * @param v a vector to be sorted (modified in place)
     ****************************************************************************/
    public void sort (Vector v) {
        quickSort (v, 0, v.size() - 1);
    }
+
+ /****************************************************************************
+ * Compare two objects.
+ *
+ * By default this method works for Strings. It is meant to be overwritten
+ * for other objects.
+ *